# This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. # This patch includes the following deltas: # ChangeSet 1.1350 -> 1.1397 # drivers/net/8139too.c 1.74 -> 1.75 # arch/sparc64/kernel/entry.S 1.29 -> 1.30 # include/asm-x86_64/irq.h 1.3 -> 1.4 # include/asm-x86_64/smp.h 1.12 -> 1.13 # include/asm-um/smp.h 1.3 -> 1.6 # net/llc/af_llc.c 1.51 -> 1.57 # arch/um/drivers/chan_user.c 1.5 -> 1.8 # drivers/pci/quirks.c 1.36 -> 1.37 # arch/um/drivers/mconsole_kern.c 1.6 -> 1.9 # arch/um/kernel/tt/tracer.c 1.3 -> 1.6 # arch/ia64/kernel/efi.c 1.26 -> 1.27 # include/linux/mm.h 1.133 -> 1.136 # arch/um/include/user.h 1.1 -> 1.4 # arch/um/kernel/ksyms.c 1.8 -> 1.11 # Documentation/networking/irda.txt 1.1 -> 1.2 # net/Kconfig 1.26 -> 1.28 # include/asm-ia64/io.h 1.17 -> 1.18 # arch/um/drivers/xterm.c 1.11 -> 1.14 # include/asm-um/processor-i386.h 1.2 -> 1.5 # net/ipv6/ip6_output.c 1.44 -> 1.46 # arch/um/drivers/port_kern.c 1.15 -> 1.18 # arch/um/kernel/trap_kern.c 1.7 -> 1.10 # net/llc/llc_conn.c 1.38 -> 1.39 # arch/um/kernel/tt/tlb.c 1.4 -> 1.7 # mm/readahead.c 1.38 -> 1.41 # include/asm-um/current.h 1.3 -> 1.6 # include/asm-sparc/unistd.h 1.26 -> 1.27 # arch/um/util/mk_constants_kern.c 1.1 -> 1.4 # include/asm-sparc/namei.h 1.1 -> 1.2 # net/llc/llc_proc.c 1.18 -> 1.20 # include/linux/init_task.h 1.27 -> 1.30 # include/linux/sched.h 1.173 -> 1.177 # drivers/net/wireless/airo.c 1.77 -> 1.79 # include/linux/writeback.h 1.23 -> 1.26 # drivers/media/video/bttv-if.c 1.16 -> 1.17 # include/net/tcp.h 1.51 -> 1.52 # arch/um/include/user_util.h 1.11 -> 1.14 # mm/vmscan.c 1.173 -> 1.176 # arch/um/drivers/mmapper_kern.c 1.5 -> 1.8 # arch/um/include/sysdep-i386/checksum.h 1.2 -> 1.5 # arch/um/kernel/ptrace.c 1.11 -> 1.14 # arch/x86_64/kernel/x8664_ksyms.c 1.18 -> 1.20 # arch/um/sys-i386/Makefile 1.15 -> 1.18 # net/ipv4/udp.c 1.53 -> 1.55 # include/asm-sparc64/unistd.h 1.25 -> 1.26 # arch/um/drivers/mconsole_user.c 1.2 -> 1.5 # arch/um/kernel/user_util.c 1.6 -> 1.9 # arch/alpha/kernel/setup.c 1.38 -> 1.39 # include/asm-um/processor-generic.h 1.6 -> 1.9 # arch/um/include/time_user.h 1.4 -> 1.7 # arch/um/kernel/tt/exec_kern.c 1.3 -> 1.6 # arch/um/kernel/time.c 1.8 -> 1.11 # include/asm-um/common.lds.S 1.2 -> 1.5 # include/asm-sparc64/spinlock.h 1.7 -> 1.8 # include/linux/serial.h 1.9 -> 1.10 # arch/x86_64/kernel/acpi/boot.c 1.6 -> 1.7 # arch/um/include/os.h 1.3 -> 1.6 # net/ipv6/xfrm6_policy.c 1.13 -> 1.14 # arch/um/kernel/smp.c 1.9 -> 1.12 # drivers/input/mouse/psmouse-base.c 1.34 -> 1.35 # include/linux/udp.h 1.7 -> 1.8 # drivers/serial/serial_core.c 1.72 -> 1.74 # drivers/net/ethertap.c 1.10 -> 1.11 # include/asm-i386/spinlock.h 1.10 -> 1.13 # include/asm-sparc64/namei.h 1.1 -> 1.2 # arch/um/drivers/stdio_console.c 1.12 -> 1.15 # arch/um/drivers/ubd_user.c 1.3 -> 1.6 # drivers/net/starfire.c 1.31 -> 1.33 # arch/um/kernel/skas/process_kern.c 1.5 -> 1.8 # arch/x86_64/kernel/smp.c 1.17 -> 1.18 # include/linux/preempt.h 1.6 -> 1.7 # fs/jfs/jfs_metapage.c 1.24 -> 1.25 # include/linux/fs.h 1.274 -> 1.277 # arch/um/kernel/sigio_kern.c 1.2 -> 1.5 # arch/i386/kernel/entry.S 1.69 -> 1.72 # include/linux/quotaops.h 1.14 -> 1.17 # arch/um/kernel/exec_kern.c 1.5 -> 1.8 # net/ipv4/tcp.c 1.49 -> 1.50 # arch/x86_64/kernel/setup64.c 1.17 -> 1.18 # include/linux/ip.h 1.10 -> 1.11 # fs/jbd/transaction.c 1.76 -> 1.80 # arch/um/kernel/skas/Makefile 1.7 -> 1.10 # arch/um/kernel/tt/uaccess_user.c 1.3 -> 1.6 # 
arch/h8300/kernel/time.c 1.3 -> 1.4 # net/ipv4/tcp_ipv4.c 1.72 -> 1.75 # include/asm-um/thread_info.h 1.4 -> 1.7 # include/asm-ia64/module.h 1.11 -> 1.12 # net/ipv4/netfilter/ipt_REDIRECT.c 1.6 -> 1.7 # mm/filemap.c 1.210.2.1 -> 1.214 # arch/um/kernel/sysrq.c 1.6 -> 1.9 # net/socket.c 1.69 -> 1.70 # arch/x86_64/kernel/entry.S 1.13 -> 1.14 # arch/um/include/kern_util.h 1.7 -> 1.10 # fs/binfmt_misc.c 1.22 -> 1.23 # net/core/sysctl_net_core.c 1.7 -> 1.8 # arch/um/kernel/mem.c 1.18 -> 1.21 # mm/mprotect.c 1.25 -> 1.28 # arch/um/kernel/skas/util/mk_ptregs.c 1.1 -> 1.4 # include/asm-x86_64/pci.h 1.11 -> 1.12 # lib/div64.c 1.2 -> 1.3 # arch/um/include/mconsole.h 1.3 -> 1.6 # drivers/media/video/tuner-3036.c 1.10 -> 1.11 # arch/ia64/kernel/module.c 1.8 -> 1.9 # arch/sparc64/kernel/systbls.S 1.48 -> 1.49 # arch/ia64/kernel/unwind_i.h 1.5 -> 1.6 # arch/um/kernel/skas/include/uaccess.h 1.5 -> 1.8 # drivers/net/sis900.c 1.46 -> 1.47 # arch/sparc64/lib/rwlock.S 1.1 -> 1.2 # fs/fat/inode.c 1.80 -> 1.81 # mm/memory.c 1.139.2.1 -> 1.143 # include/linux/llc.h 1.3 -> 1.4 # arch/um/uml.lds.S 1.16 -> 1.19 # drivers/usb/serial/digi_acceleport.c 1.40 -> 1.41 # arch/sparc/kernel/entry.S 1.16 -> 1.17 # arch/um/Makefile-skas 1.2 -> 1.5 # arch/sparc64/kernel/rtrap.S 1.16 -> 1.17 # arch/um/kernel/trap_user.c 1.9 -> 1.12 # arch/um/drivers/net_kern.c 1.15 -> 1.18 # arch/ia64/kernel/process.c 1.48 -> 1.49 # include/asm-um/bug.h 1.2 -> 1.5 # drivers/media/video/bttv-cards.c 1.22 -> 1.23 # net/ipv4/ipmr.c 1.32 -> 1.34 # arch/um/drivers/ubd_kern.c 1.35 -> 1.38 # arch/um/Kconfig 1.12 -> 1.15 # include/linux/spinlock.h 1.27 -> 1.30 # arch/sparc64/lib/dec_and_lock.S 1.4 -> 1.5 # arch/sparc/kernel/systbls.S 1.24 -> 1.25 # net/ipv6/ipcomp6.c 1.7 -> 1.8 # arch/um/drivers/ssl.c 1.9 -> 1.12 # net/core/dev.c 1.120 -> 1.122 # arch/x86_64/kernel/io_apic.c 1.16 -> 1.17 # net/xfrm/xfrm_policy.c 1.41 -> 1.43 # arch/um/sys-i386/fault.c 1.2 -> 1.5 # drivers/net/r8169.c 1.15 -> 1.16 # arch/um/dyn.lds.S 1.3 -> 1.6 # arch/um/kernel/process_kern.c 1.17 -> 1.20 # drivers/net/pcmcia/ibmtr_cs.c 1.19 -> 1.20 # drivers/media/video/meye.h 1.10 -> 1.11 # arch/um/kernel/mem_user.c 1.5 -> 1.8 # include/asm-ia64/unwind.h 1.7 -> 1.8 # drivers/acpi/ec.c 1.26 -> 1.27 # drivers/pnp/isapnp/core.c 1.43 -> 1.44 # arch/um/kernel/skas/include/mode.h 1.2 -> 1.5 # arch/um/include/ubd_user.h 1.1 -> 1.4 # include/linux/sysctl.h 1.52 -> 1.53 # drivers/net/3c527.c 1.16 -> 1.17 # arch/um/kernel/signal_kern.c 1.14 -> 1.17 # fs/Kconfig 1.38.1.1 -> 1.41 # arch/um/drivers/Makefile 1.10 -> 1.13 # include/asm-um/page.h 1.4 -> 1.7 # fs/direct-io.c 1.34 -> 1.35 # fs/buffer.c 1.215 -> 1.218 # fs/Makefile 1.59 -> 1.62 # drivers/char/sonypi.h 1.18 -> 1.19 # drivers/net/arm/ether1.c 1.15 -> 1.16 # arch/um/Kconfig_net 1.2 -> 1.5 # drivers/char/tty_io.c 1.126.2.1 -> 1.130 # MAINTAINERS 1.176 -> 1.177 # arch/um/kernel/tt/ptproxy/proxy.c 1.5 -> 1.8 # drivers/pcmcia/yenta_socket.c 1.49 -> 1.50 # net/compat.c 1.11 -> 1.12 # arch/um/kernel/tty_log.c 1.4 -> 1.7 # mm/mmap.c 1.92.2.1 -> 1.96 # arch/um/Makefile 1.26 -> 1.29 # arch/um/kernel/time_kern.c 1.9 -> 1.12 # mm/Makefile 1.24 -> 1.27 # include/asm-um/timex.h 1.2 -> 1.5 # drivers/net/tokenring/ibmtr.c 1.18 -> 1.19 # include/asm-i386/unistd.h 1.30 -> 1.33 # arch/um/kernel/tt/include/uaccess.h 1.3 -> 1.6 # arch/um/defconfig 1.8 -> 1.11 # arch/um/include/mem_user.h 1.4 -> 1.7 # Makefile 1.435.1.1 -> 1.439 # drivers/net/pcmcia/fmvj18x_cs.c 1.26 -> 1.27 # arch/um/kernel/Makefile 1.20 -> 1.23 # arch/um/Kconfig_block 1.2 -> 1.5 # 
arch/x86_64/kernel/time.c 1.25 -> 1.26 # net/ipv6/ah6.c 1.24 -> 1.25 # include/asm-x86_64/topology.h 1.5 -> 1.6 # include/linux/page-flags.h 1.42 -> 1.45 # include/linux/ipv6.h 1.12 -> 1.14 # include/asm-sparc64/hardirq.h 1.16 -> 1.17 # arch/um/kernel/syscall_kern.c 1.10 -> 1.13 # arch/um/include/sysdep-i386/sigcontext.h 1.3 -> 1.6 # arch/um/kernel/uaccess_user.c 1.1 -> 1.4 # arch/um/drivers/xterm_kern.c 1.4 -> 1.7 # arch/um/kernel/config.c.in 1.1 -> 1.4 # kernel/module.c 1.94 -> 1.95 # net/core/skbuff.c 1.31 -> 1.32 # net/xfrm/xfrm_state.c 1.35 -> 1.36 # drivers/usb/serial/Kconfig 1.10 -> 1.11 # arch/ia64/kernel/perfmon_mckinley.h 1.9 -> 1.10 # net/core/sock.c 1.28 -> 1.29 # drivers/media/video/bt832.c 1.3 -> 1.4 # net/ipv4/ip_gre.c 1.33 -> 1.34 # net/ipv4/tcp_input.c 1.46 -> 1.47 # Documentation/video4linux/meye.txt 1.7 -> 1.8 # drivers/media/video/saa5249.c 1.19 -> 1.20 # arch/um/include/line.h 1.6 -> 1.9 # net/ipv4/tcp_minisocks.c 1.42 -> 1.43 # include/asm-sparc/ioctl.h 1.2 -> 1.3 # arch/ia64/kernel/gate-data.S 1.1 -> 1.2 # fs/jfs/namei.c 1.34 -> 1.35 # net/llc/llc_sap.c 1.30 -> 1.31 # include/linux/socket.h 1.9 -> 1.10 # arch/um/sys-i386/bugs.c 1.3 -> 1.6 # arch/um/Makefile-i386 1.6 -> 1.9 # arch/um/kernel/um_arch.c 1.12 -> 1.15 # kernel/sched.c 1.220.2.1 -> 1.224 # net/ipv4/netfilter/ip_fw_compat_masq.c 1.11 -> 1.12 # include/asm-ia64/namei.h 1.2 -> 1.3 # drivers/input/keyboard/atkbd.c 1.38 -> 1.39 # net/llc/llc_input.c 1.33 -> 1.34 # include/linux/jbd.h 1.38 -> 1.41 # include/asm-um/archparam-i386.h 1.3 -> 1.6 # arch/um/kernel/irq.c 1.11 -> 1.14 # net/ipv6/udp.c 1.50 -> 1.53 # net/ipv4/netfilter/Kconfig 1.16 -> 1.17 # arch/x86_64/mm/numa.c 1.6 -> 1.7 # arch/sparc64/kernel/sparc64_ksyms.c 1.59 -> 1.60 # drivers/acpi/dispatcher/dsopcode.c 1.21 -> 1.22 # arch/um/kernel/tt/process_kern.c 1.8 -> 1.11 # include/asm-sparc64/ioctl.h 1.2 -> 1.3 # arch/um/kernel/sys_call_table.c 1.20 -> 1.23 # arch/um/kernel/process.c 1.10 -> 1.13 # net/ipv6/mcast.c 1.39 -> 1.40 # net/ipv4/ipcomp.c 1.16 -> 1.17 # drivers/net/arm/etherh.c 1.17 -> 1.18 # arch/sparc64/Kconfig 1.38 -> 1.39 # arch/ia64/kernel/irq.c 1.29 -> 1.30 # kernel/exit.c 1.117 -> 1.119 # net/ipv4/ah4.c 1.27 -> 1.28 # arch/um/drivers/line.c 1.19 -> 1.22 # arch/um/kernel/user_syms.c 1.3 -> 1.6 # include/asm-um/fixmap.h 1.2 -> 1.5 # arch/um/kernel/umid.c 1.6 -> 1.9 # include/asm-h8300/smplock.h 1.1 -> (deleted) # arch/x86_64/mm/extable.c 1.5 -> 1.6 # drivers/net/bonding/bond_main.c 1.46 -> 1.47 # net/ipv6/addrconf.c 1.71 -> 1.74 # arch/ia64/kernel/perfmon.c 1.64 -> 1.67 # include/linux/netdevice.h 1.64 -> 1.65 # mm/truncate.c 1.11 -> 1.14 # crypto/api.c 1.30 -> 1.31 # arch/um/kernel/skas/process.c 1.5 -> 1.8 # include/net/if_inet6.h 1.9 -> 1.10 # include/asm-x86_64/hw_irq.h 1.6 -> 1.7 # arch/um/config.release 1.2 -> 1.5 # arch/um/drivers/hostaudio_kern.c 1.3 -> 1.6 # arch/i386/kernel/sys_i386.c 1.13 -> 1.16 # drivers/pci/setup-bus.c 1.22 -> 1.23 # include/asm-x86_64/processor.h 1.24 -> 1.25 # drivers/net/arm/ether3.c 1.18 -> 1.19 # drivers/md/dm-table.c 1.24 -> 1.25 # arch/x86_64/mm/k8topology.c 1.5 -> 1.6 # arch/um/drivers/chan_kern.c 1.5 -> 1.8 # include/asm-um/irq.h 1.3 -> 1.6 # fs/inode.c 1.107.1.1 -> 1.110 # arch/um/os-Linux/file.c 1.7 -> 1.10 # arch/um/include/mem.h 1.2 -> 1.5 # include/asm-um/system-generic.h 1.3 -> 1.6 # arch/um/os-Linux/drivers/tuntap_user.c 1.3 -> 1.6 # arch/um/kernel/init_task.c 1.4 -> 1.7 # fs/fs-writeback.c 1.41 -> 1.44 # include/linux/in.h 1.7 -> 1.8 # include/asm-um/pgtable.h 1.12 -> 1.15 # (new) -> 1.1 
include/asm-um/page.h~uml-summa.diff # (new) -> 1.1 arch/um/kernel/process_kern.c~uml-summa.diff # (new) -> 1.1 patches/pc/fsync_super.diff.pc # (new) -> 1.1 arch/um/drivers/xterm_kern.c~uml-summa.diff # (new) -> 1.1 include/linux/sched.h~fs_activation.diff # (new) -> 1.1 arch/um/kernel/time_kern.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/tty_log.c~uml-summa.diff # (new) -> 1.3 patches/patches/uml-kill-irq_kern.h.diff.patch # (new) -> 1.9 patches/patches/fsync_super.diff.patch # (new) -> 1.1 patches/bin/rolled-up-patch # (new) -> 1.1 patches/bin/pushpatch # (new) -> 1.1 arch/um/include/user_util.h~uml-summa.diff # (new) -> 1.3 patches/pc/uml-summa.diff.pc # (new) -> 1.1 arch/um/kernel/signal_kern.c~uml-summa.diff # (new) -> 1.1 mm/readahead.c~export-page_cache_readahead.diff # (new) -> 1.1 arch/um/sys-i386/fault.c~uml-summa.diff # (new) -> 1.1 arch/um/uml.lds.S~uml-summa.diff # (new) -> 1.1 mm/truncate.c~truncate_mapping_pages_range.diff # (new) -> 1.1 patches/pc/uml-kill-cow.diff.pc # (new) -> 1.9 patches/patches/export-generic_forget_inode.diff.patch # (new) -> 1.1 patches/bin/fpatch # (new) -> 1.1 patches/pc/truncate_mapping_pages_range.diff.pc # (new) -> 1.1 arch/um/kernel/sigio_kern.c~uml-kill-irq_kern.h.diff # (new) -> 1.1 arch/um/include/ubd_user.h~uml-summa.diff # (new) -> 1.1 arch/um/include/os.h~uml-summa.diff # (new) -> 1.1 patches/bin/combine-applied # (new) -> 1.1 include/linux/writeback.h~sb_sync_inodes.diff # (new) -> 1.1 arch/um/kernel/tt/exec_kern.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/syscall_kern.c~uml-summa.diff # (new) -> 1.1 arch/um/defconfig~uml-summa.diff # (new) -> 1.1 fs/buffer.c~fsync_super.diff # (new) -> 1.9 patches/patches/export-remove_from_page_cache.diff.patch # (new) -> 1.3 patches/patches/uml-AUTOCONF_INCLUDED.diff.patch # (new) -> 1.1 kernel/sched.c~spinlock-owner.diff # (new) -> 1.1 include/asm-um/processor-i386.h~uml-summa.diff # (new) -> 1.1 fs/Makefile~reiser4-fs-Makefile.diff # (new) -> 1.1 include/asm-um/sections.h # (new) -> 1.1 arch/um/drivers/chan_user.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/skas/process.c~uml-summa.diff # (new) -> 1.1 patches/bin/touched-by-patch # (new) -> 1.1 include/asm-um/archparam-i386.h~uml-summa.diff # (new) -> 1.1 arch/um/include/sysdep-i386/sigcontext.h~uml-summa.diff # (new) -> 1.1 arch/um/drivers/mconsole_user.c~uml-summa.diff # (new) -> 1.1 patches/txt/reget-page-mapping.diff.txt # (new) -> 1.1 arch/um/kernel/mem.c~uml-summa.diff # (new) -> 1.1 patches/pc/spinlock-owner.diff.pc # (new) -> 1.1 patches/pc/uml-sched_clock.diff.pc # (new) -> 1.9 patches/patches/export-page_cache_readahead.diff.patch # (new) -> 1.1 patches/bin/poppatch # (new) -> 1.1 arch/um/config.release~uml-summa.diff # (new) -> 1.1 fs/Makefile~uml-summa.diff # (new) -> 1.1 patches/pc/uml-tty-init.diff.pc # (new) -> 1.1 patches/bin/new-kernel # (new) -> 1.4 patches/applied-patches # (new) -> 1.1 arch/um/drivers/mconsole_kern.c~uml-kill-irq_kern.h.diff # (new) -> 1.2 patches/patches/uml-asm-sections.diff.patch # (new) -> 1.1 arch/um/include/sysdep-i386/checksum.h~uml-summa.diff # (new) -> 1.1 arch/um/drivers/hostaudio_kern.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/skas/include/uaccess.h~uml-summa.diff # (new) -> 1.5 patches/patches/uml-asm-local-h.diff.patch # (new) -> 1.1 arch/um/Makefile~uml-summa.diff # (new) -> 1.1 arch/um/kernel/sys_call_table.c~uml-summa.diff # (new) -> 1.1 arch/um/drivers/ubd_user.c~uml-kill-cow.diff # (new) -> 1.1 patches/pc/reiser4-fs-Kconfig.diff.pc # (new) -> 1.1 
arch/um/drivers/ubd_user.c~uml-summa.diff # (new) -> 1.1 include/asm-i386/spinlock.h~spinlock-owner.diff # (new) -> 1.1 Makefile~all-sources.diff # (new) -> 1.1 include/asm-um/thread_info.h~uml-summa.diff # (new) -> 1.1 mm/Makefile~uml-summa.diff # (new) -> 1.1 patches/bin/tag-series # (new) -> 1.2 patches/pc/export-page_cache_readahead.diff.pc # (new) -> 1.1 patches/bin/import_patch # (new) -> 1.7 patches/series # (new) -> 1.1 arch/um/os-Linux/file.c~uml-summa.diff # (new) -> 1.1 patches/bin/p0-2-p1 # (new) -> 1.1 include/asm-um/pgtable.h~uml-summa.diff # (new) -> 1.1 include/asm-um/timex.h~uml-summa.diff # (new) -> 1.1 arch/um/Kconfig_block~uml-summa.diff # (new) -> 1.1 patches/pc/uml-AUTOCONF_INCLUDED.diff.pc # (new) -> 1.1 fs/hostfs/hostfs_kern.c # (new) -> 1.1 arch/um/defconfig~uml-kill-cow.diff # (new) -> 1.3 patches/patches/uml-kill-cow.diff.patch # (new) -> 1.1 arch/um/include/mem_user.h~uml-summa.diff # (new) -> 1.1 fs/hostfs/hostfs.h # (new) -> 1.1 fs/hppfs/Makefile # (new) -> 1.1 fs/inode.c~export-generic_forget_inode.diff # (new) -> 1.1 patches/patches/reget-page-mapping.diff.patch # (new) -> 1.1 patches/bin/added-by-patch # (new) -> 1.1 include/linux/jbd.h~fs_activation.diff # (new) -> 1.1 patches/bin/p_diff # (new) -> 1.1 patches/pc/export-remove_from_page_cache.diff.pc # (new) -> 1.5 patches/patches/uml-asm-cpufeature-h.diff.patch # (new) -> 1.1 arch/um/drivers/ssl.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/skas/include/mode.h~uml-summa.diff # (new) -> 1.1 patches/pc/reget-page-mapping.diff.pc # (new) -> 1.1 include/linux/fs.h~sb_sync_inodes.diff # (new) -> 1.1 arch/um/sys-i386/Makefile~uml-summa.diff # (new) -> 1.1 arch/um/os-Linux/drivers/tuntap_user.c~uml-summa.diff # (new) -> 1.9 patches/patches/all-sources.diff.patch # (new) -> 1.1 arch/um/kernel/irq.c~uml-kill-irq_kern.h.diff # (new) -> 1.1 arch/um/kernel/umid.c~uml-summa.diff # (new) -> 1.1 include/linux/mm.h~truncate_mapping_pages_range.diff # (new) -> 1.1 arch/um/kernel/config.c.in~uml-summa.diff # (new) -> 1.1 patches/pc/uml-asm-local-h.diff.pc # (new) -> 1.1 patches/pc/do_mmap2-fix.diff.pc # (new) -> 1.1 include/linux/init_task.h~fs_activation.diff # (new) -> 1.1 arch/um/kernel/irq.c~uml-summa.diff # (new) -> 1.1 include/asm-um/cpufeature.h # (new) -> 1.1 patches/pc/uml-asm-cpufeature-h.diff.pc # (new) -> 1.1 mm/memory.c~init_fixmap_vma.diff # (new) -> 1.1 arch/um/drivers/port_kern.c~uml-kill-irq_kern.h.diff # (new) -> 1.1 patches/pc/static-inline-quotaops.diff.pc # (new) -> 1.1 fs/jbd/transaction.c~fs_activation.diff # (new) -> 1.1 patches/bin/patchdesc # (new) -> 1.1 arch/um/drivers/mmapper_kern.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/tt/include/uaccess.h~uml-summa.diff # (new) -> 1.9 patches/patches/spinlock-owner.diff.patch # (new) -> 1.1 include/asm-um/fixmap.h~uml-summa.diff # (new) -> 1.1 include/asm-um/processor-generic.h~uml-summa.diff # (new) -> 1.1 arch/um/include/line.h~uml-summa.diff # (new) -> 1.1 arch/um/drivers/net_kern.c~uml-kill-irq_kern.h.diff # (new) -> 1.1 arch/um/kernel/trap_user.c~uml-summa.diff # (new) -> 1.1 mm/mprotect.c~do_mmap2-fix.diff # (new) -> 1.1 arch/um/kernel/uaccess_user.c~uml-summa.diff # (new) -> 1.1 patches/bin/toppatch # (new) -> 1.1 patches/bin/cat-series # (new) -> 1.1 fs/hostfs/hostfs_user.c # (new) -> 1.1 arch/um/drivers/port_kern.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/time.c~uml-export-in-ksyms.c.diff # (new) -> 1.1 patches/pc/uml-asm-module-i386.h.diff.pc # (new) -> 1.1 include/asm-um/local.h # (new) -> 1.1 patches/bin/rpatch # (new) -> 1.1 
include/linux/page-flags.h~page-owner.diff # (new) -> 1.1 arch/um/drivers/ubd_kern.c~uml-summa.diff # (new) -> 1.1 arch/um/drivers/xterm.c~uml-summa.diff # (new) -> 1.1 arch/um/Kconfig_net~uml-summa.diff # (new) -> 1.1 patches/bin/removed-by-patch # (new) -> 1.1 patches/bin/stripspace # (new) -> 1.1 arch/um/Kconfig_block~uml-kill-cow.diff # (new) -> 1.1 arch/um/drivers/stdio_console.c~uml-tty-init.diff # (new) -> 1.1 fs/Kconfig~reiser4-fs-Kconfig.diff # (new) -> 1.1 arch/um/drivers/xterm_kern.c~uml-kill-irq_kern.h.diff # (new) -> 1.1 patches/pc/uml-kill-irq_kern.h.diff.pc # (new) -> 1.1 arch/um/kernel/ptrace.c~uml-summa.diff # (new) -> 1.1 patches/bin/patchfns # (new) -> 1.1 include/linux/mm.h~do_mmap2-fix.diff # (new) -> 1.1 arch/um/util/mk_constants_kern.c~uml-summa.diff # (new) -> 1.1 patches/bin/inpatch # (new) -> 1.3 patches/patches/uml-sched_clock.diff.patch # (new) -> 1.1 include/asm-um/current.h~uml-summa.diff # (new) -> 1.1 patches/bin/pcpatch # (new) -> 1.1 include/linux/fs.h~export-generic_forget_inode.diff # (new) -> 1.9 patches/patches/static-inline-quotaops.diff.patch # (new) -> 1.1 include/linux/spinlock.h~spinlock-owner.diff # (new) -> 1.1 include/asm-um/irq.h~uml-summa.diff # (new) -> 1.1 mm/mmap.c~do_mmap2-fix.diff # (new) -> 1.1 arch/um/kernel/tt/process_kern.c~uml-summa.diff # (new) -> 1.1 arch/um/Kconfig~uml-summa.diff # (new) -> 1.1 arch/um/include/time_user.h~uml-summa.diff # (new) -> 1.1 arch/um/drivers/line.c~uml-summa.diff # (new) -> 1.1 patches/bin/refpatch # (new) -> 1.1 arch/um/kernel/mem_user.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/process.c~uml-summa.diff # (new) -> 1.1 arch/um/sys-i386/bugs.c~uml-summa.diff # (new) -> 1.1 arch/i386/kernel/sys_i386.c~do_mmap2-fix.diff # (new) -> 1.1 include/asm-um/module-i386.h # (new) -> 1.1 arch/um/kernel/trap_kern.c~uml-summa.diff # (new) -> 1.1 patches/pc/sb_sync_inodes.diff.pc # (new) -> 1.9 patches/patches/truncate_mapping_pages_range.diff.patch # (new) -> 1.1 patches/bin/unitdiff.py # (new) -> 1.1 patches/bin/join-patch # (new) -> 1.1 arch/um/drivers/line.c~uml-kill-irq_kern.h.diff # (new) -> 1.1 mm/filemap.c~export-remove_from_page_cache.diff # (new) -> 1.1 arch/um/kernel/exec_kern.c~uml-summa.diff # (new) -> 1.9 patches/patches/uml-summa.diff.patch # (new) -> 1.1 arch/um/kernel/skas/Makefile~uml-summa.diff # (new) -> 1.1 arch/um/kernel/init_task.c~uml-summa.diff # (new) -> 1.1 arch/um/drivers/chan_kern.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/tt/ptproxy/proxy.c~uml-summa.diff # (new) -> 1.1 include/asm-um/system-generic.h~uml-summa.diff # (new) -> 1.1 patches/bin/ptkdiff # (new) -> 1.1 arch/um/kernel/skas/util/mk_ptregs.c~uml-summa.diff # (new) -> 1.1 patches/bin/split-patch # (new) -> 1.3 patches/patches/uml-export-in-ksyms.c.diff.patch # (new) -> 1.1 arch/um/kernel/um_arch.c~uml-summa.diff # (new) -> 1.1 include/linux/quotaops.h~static-inline-quotaops.diff # (new) -> 1.1 patches/bin/rename-patch # (new) -> 1.9 patches/patches/sb_sync_inodes.diff.patch # (new) -> 1.1 arch/um/include/user.h~uml-summa.diff # (new) -> 1.1 include/asm-um/common.lds.S~uml-summa.diff # (new) -> 1.1 patches/bin/pstatus # (new) -> 1.1 patches/pc/i386-sys_reiser4.diff.pc # (new) -> 1.1 arch/um/drivers/ubd_kern.c~uml-kill-irq_kern.h.diff # (new) -> 1.1 include/asm-um/bug.h~uml-summa.diff # (new) -> 1.1 patches/pc/page-owner.diff.pc # (new) -> 1.1 arch/um/drivers/mconsole_kern.c~uml-summa.diff # (new) -> 1.1 patches/pc/uml-asm-sections.diff.pc # (new) -> 1.1 fs/fs-writeback.c~sb_sync_inodes.diff # (new) -> 1.1 
patches/bin/combine-series # (new) -> 1.1 patches/pc/export-generic_forget_inode.diff.pc # (new) -> 1.1 patches/bin/cvs-take-patch # (new) -> 1.1 patches/bin/apatch # (new) -> 1.1 patches/bin/mpatch # (new) -> 1.1 arch/um/kernel/smp.c~uml-summa.diff # (new) -> 1.1 patches/pc/init_fixmap_vma.diff.pc # (new) -> 1.1 patches/bin/extract_description # (new) -> 1.1 fs/hostfs/Makefile # (new) -> 1.1 arch/um/include/mconsole.h~uml-summa.diff # (new) -> 1.9 patches/patches/fs_activation.diff.patch # (new) -> 1.1 arch/um/Makefile-i386~uml-summa.diff # (new) -> 1.1 arch/um/kernel/tt/tracer.c~uml-summa.diff # (new) -> 1.1 patches/bin/linus-patch # (new) -> 1.1 arch/um/kernel/sigio_kern.c~uml-summa.diff # (new) -> 1.1 patches/bin/prep-patch # (new) -> 1.1 arch/um/kernel/user_syms.c~uml-AUTOCONF_INCLUDED.diff # (new) -> 1.9 patches/patches/reiser4-fs-Makefile.diff.patch # (new) -> 1.9 patches/patches/init_fixmap_vma.diff.patch # (new) -> 1.1 fs/hppfs/hppfs_kern.c # (new) -> 1.1 arch/um/drivers/net_kern.c~uml-summa.diff # (new) -> 1.1 arch/um/drivers/stdio_console.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/tt/tlb.c~uml-summa.diff # (new) -> 1.1 arch/um/include/mem.h~uml-summa.diff # (new) -> 1.9 patches/patches/do_mmap2-fix.diff.patch # (new) -> 1.1 patches/bin/export_patch # (new) -> 1.9 patches/patches/page-owner.diff.patch # (new) -> 1.1 arch/um/include/kern_util.h~uml-summa.diff # (new) -> 1.1 patches/pc/reiser4-fs-Makefile.diff.pc # (new) -> 1.1 arch/um/kernel/sysrq.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/Makefile~uml-summa.diff # (new) -> 1.2 patches/patches/uml-asm-module-i386.h.diff.patch # (new) -> 1.1 patches/pc/all-sources.diff.pc # (new) -> 1.1 arch/um/drivers/Makefile~uml-summa.diff # (new) -> 1.1 arch/i386/kernel/entry.S~i386-sys_reiser4.diff # (new) -> 1.1 patches/todo # (new) -> 1.1 drivers/char/tty_io.c~uml-tty-init.diff # (new) -> 1.3 patches/patches/uml-tty-init.diff.patch # (new) -> 1.1 arch/um/kernel/user_util.c~uml-summa.diff # (new) -> 1.1 patches/pc/fs_activation.diff.pc # (new) -> 1.1 arch/um/kernel/skas/process_kern.c~uml-summa.diff # (new) -> 1.1 arch/um/kernel/time_kern.c~uml-sched_clock.diff # (new) -> 1.1 arch/um/Makefile-skas~uml-summa.diff # (new) -> 1.1 arch/um/dyn.lds.S~uml-summa.diff # (new) -> 1.1 arch/um/kernel/ksyms.c~uml-export-in-ksyms.c.diff # (new) -> 1.10 patches/patches/reiser4-fs-Kconfig.diff.patch # (new) -> 1.1 patches/pc/uml-export-in-ksyms.c.diff.pc # (new) -> 1.1 arch/um/kernel/tt/uaccess_user.c~uml-summa.diff # (new) -> 1.9 patches/patches/i386-sys_reiser4.diff.patch # (new) -> 1.1 patches/bin/docco.txt # (new) -> 1.1 include/asm-i386/unistd.h~i386-sys_reiser4.diff # (new) -> 1.1 include/linux/mm.h~page-owner.diff # (new) -> 1.1 include/asm-um/smp.h~uml-summa.diff # (new) -> 1.1 arch/um/kernel/time.c~uml-summa.diff # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 03/10/24 ak@muc.de 1.1344.2.6 # [NET]: Limit SO_BSDCOMPAT warning. # -------------------------------------------- # 03/10/24 davem@nuts.ninka.net 1.1344.2.7 # [TCP]: Zero initial timestamps are valid, Windows XP emits these. # -------------------------------------------- # 03/10/24 davem@nuts.ninka.net 1.1351 # Merge nuts.ninka.net:/disk1/davem/BK/network-2.5 # into nuts.ninka.net:/disk1/davem/BK/net-2.5 # -------------------------------------------- # 03/10/24 davidm@tiger.hpl.hp.com 1.1337.45.17 # ia64: Fix/finish kernel module table support so it actually works. 
# -------------------------------------------- # 03/10/25 davem@nuts.ninka.net 1.1352 # [IPV4]: Fix typo in ipmr.c # -------------------------------------------- # 03/10/25 torvalds@home.osdl.org 1.1350.1.1 # Merge http://lia64.bkbits.net/to-linus-2.5 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/25 torvalds@home.osdl.org 1.1350.1.2 # Linux 2.6.0-test9 # -------------------------------------------- # 03/10/26 Andries.Brouwer@cwi.nl 1.1350.1.3 # [PATCH] atkbd: 0xfa is ACK # # The 0xfa code can be a key scancode or it can be a protocol scancode. # # Only a few keyboards use it as a key scancode, and if we always interpret # it as a protocol scancode then these rare keyboards will have a dead # key. If we interpret it as a key scancode then we have a dead keyboard # in case it was a protocol scancode. # # Clearly it is safer to prefer to interpret it as a protocol scancode. # # This moves the test for ACK and NAK up, so that they are always seen as # protocol. # # This is just a minimal patch. What I did in 1.1.54 was to keep track of # commands sent with a reply_expected flag, so that 0xfa could be taken as # ACK when a reply is expected and as a key scancode otherwise. That is the # better solution, but it requires larger surgery. # -------------------------------------------- # 03/10/26 Andries.Brouwer@cwi.nl 1.1350.1.4 # [PATCH] Relax FATFS validity tests # # The first FAT entry should contain the media byte (0xf0,0xf8,...,0xff) # extended with all 1 bits. # # Checking this is a good idea: it prevents us from mounting garbage # as FAT - there is no good magic for FAT. # # Unfortunately, Windows does not enforce this, and 2.4 doesn't either. # It turns out that there are filesystems around (two reports so far) that # have a zero first FAT entry, and work under Windows and 2.4 but fail to # mount under 2.6. # # So, this weakens the test. # -------------------------------------------- # 03/10/26 stelian@popies.net 1.1350.1.5 # [PATCH] sonypi: fix Zoom/Thumbphrase button events # # This corrects the Zoom and Thumbphrase button events. # -------------------------------------------- # 03/10/26 stelian@popies.net 1.1350.1.6 # [PATCH] meye: documentation # # This documents the existence of a fourth 'motioneye' camera plugged into # the USB bus, which is of course unsupported by the meye driver. # -------------------------------------------- # 03/10/26 ak@muc.de 1.1350.1.7 # [PATCH] Essential x86-64 updates # # The most important part is that it makes x86-64 compile again. # Without that, 2.6 users won't be very happy. # # It also works around a bug that allowed every user program to reboot the # system on B stepping K8. # # Also update to match some recent i386 fixes. # # Full ChangeLog: # - Add acpi_pic_set_level_irq to make ACPI compile again # - Work around compat mode K8 bug in IRET exception handling # - Increase exception stack.
The old 1k stack was too easy # to overflow (from Jim Paradis, changed by me) # - Replace safe_smp_processor_id with cpuid (needed for above) # - When there is only one node, always enable fake_node mode # - Merge with i386 (NTP gettimeofday monotonicity fix, irq nr_vectors change) # - Fix compile problem for UP kernels in time/cpufreq # - Set all nodes online at bootup # - Define node_to_cpumask correctly # -------------------------------------------- # 03/10/26 ysato@users.sourceforge.jp 1.1350.1.8 # [PATCH] fix h8/300 support # # - add 'sched_clock' # - delete smplock.h # -------------------------------------------- # 03/10/26 tausq@debian.org 1.1350.1.9 # [PATCH] fix __div64_32 to do division properly # # This fixes the generic __div64_32() to correctly handle divisions by # large 32-bit values (as used by nanosleep() and friends, for example). # # It's a simple bit-at-a-time implementation with a reduction of the high # 32 bits handled manually. Architectures that can do 64/32-bit divisions # in hardware should implement their own more efficient versions. (A C # sketch of the approach follows below.) # -------------------------------------------- # 03/10/26 torvalds@home.osdl.org 1.1350.1.10 # Add a sticky "PF_DEAD" task flag to keep track of dead processes. # # Use this to simplify 'finish_task_switch', but perhaps more # importantly we can use this to track down why some processes # sometimes seem not to die properly even after having been # marked as ZOMBIE. The "task->state" flags are too fluid to # allow that well. # -------------------------------------------- # 03/10/27 matthias.andree@gmx.de 1.1350.2.1 # Properly terminate /proc/tty/driver/serial output lines of known UARTs # when the caller has no CAP_SYS_ADMIN capability. # -------------------------------------------- # 03/10/26 davem@nuts.ninka.net 1.1353 # Merge nuts.ninka.net:/disk1/davem/BK/network-2.5 # into nuts.ninka.net:/disk1/davem/BK/net-2.5 # -------------------------------------------- # 03/10/26 davem@nuts.ninka.net 1.1350.1.11 # Merge nuts.ninka.net:/disk1/davem/BK/sparcwork-2.5 # into nuts.ninka.net:/disk1/davem/BK/sparc-2.5 # -------------------------------------------- # 03/10/26 levon@movementarian.org 1.1354 # [NETFILTER]: Fix modular iptables build. # -------------------------------------------- # 03/10/26 ak@muc.de 1.1355 # [NET]: Fix oops in ethertap_rx(). # -------------------------------------------- # 03/10/26 yoshfuji@linux-ipv6.org 1.1356 # [IPV6]: Typo in address comparison. # -------------------------------------------- # 03/10/26 yoshfuji@linux-ipv6.org 1.1357 # [IPV6]: Use real storage for cork'd packets, else MSG_MORE corrupts UDP packets. # -------------------------------------------- # 03/10/26 yoshfuji@linux-ipv6.org 1.1358 # [IPV4,6]: Use common storage for cork'd flow, needed to handle mapped-ipv4 ipv6 addresses properly. # -------------------------------------------- # 03/10/27 yoshfuji@linux-ipv6.org 1.1359 # [IPV6]: Process ipv4-mapped addresses properly on UDPv6 sockets. # -------------------------------------------- # 03/10/27 rusty@rustcorp.com.au 1.1360 # [NETFILTER]: Fix ipchains oops in NAT # # We updated ip_nat_setup_info to set the initialized flag and call # place_in_hashes, but *didn't* change the call in ip_fw_compat_masq.c # which also calls place_in_hashes() itself (again!). Result: corrupt # list, and the next thing which lands in the same hash bucket goes boom. # # Thanks to Andy Polyakov for chasing this down.
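#
# The __div64_32 entry above describes the algorithm only in prose; here is a
# minimal, self-contained C sketch of the same bit-at-a-time idea (userland
# illustration with a stand-in name div64_32, not the exact lib/div64.c code):
#
#	#include <stdint.h>
#	#include <stdio.h>
#
#	/* divide *n by base; quotient goes back into *n, remainder returned */
#	static uint32_t div64_32(uint64_t *n, uint32_t base)
#	{
#		uint64_t rem = *n, b = base, res = 0, d = 1;
#		uint32_t high = rem >> 32;
#
#		/* reduce the high 32 bits with one native 32-bit division */
#		if (high >= base) {
#			high /= base;
#			res = (uint64_t)high << 32;
#			rem -= (uint64_t)(high * base) << 32;
#		}
#
#		/* scale the divisor up, then subtract it back one bit at a time */
#		while ((int64_t)b > 0 && b < rem) {
#			b <<= 1;
#			d <<= 1;
#		}
#		do {
#			if (rem >= b) {
#				rem -= b;
#				res += d;
#			}
#			b >>= 1;
#			d >>= 1;
#		} while (d);
#
#		*n = res;
#		return (uint32_t)rem;
#	}
#
#	int main(void)
#	{
#		uint64_t n = 1000000000000ULL;
#		uint32_t r = div64_32(&n, 7);	/* 142857142857 rem 1 */
#		printf("%llu rem %u\n", (unsigned long long)n, r);
#		return 0;
#	}
#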
# -------------------------------------------- # 03/10/27 yoshfuji@linux-ipv6.org 1.1361 # [IPV6]: Fix bogus semicolon typo in mcast.c # -------------------------------------------- # 03/10/27 bdschuym@pandora.be 1.1362 # [NETFILTER]: Fix potential OOPS in ipt_REDIRECT. # -------------------------------------------- # 03/10/27 davem@nuts.ninka.net 1.1363 # Revert signal handling changes in tcp.c - they break SIGURG. # # Cset exclude: kuznet@ms2.inr.ac.ru|ChangeSet|20031021052951|52463 # -------------------------------------------- # 03/10/27 davem@nuts.ninka.net 1.1364 # Revert "Zero initial timestamps are valid" changeset. # # I am still not sure that this change all by itself is enough # to make us accept zero initial timestamps properly. # # Cset exclude: davem@nuts.ninka.net|ChangeSet|20031025060257|60993 # -------------------------------------------- # 03/10/27 davem@nuts.ninka.net 1.1365 # [IPV6]: Do not virt_to_page() on stack addresses, fixes OOPS. # -------------------------------------------- # 03/10/27 herbert@gondor.apana.org.au 1.1366 # [IPSEC]: Fix accidentally dropping too many refs on policies. # -------------------------------------------- # 03/10/27 matthias.andree@gmx.de 1.1350.3.1 # Merge bk://linux.bkbits.net/linux-2.5/ # into gmx.de:/suse/kernel/BK/linux-2.5 # -------------------------------------------- # 03/10/27 torvalds@home.osdl.org 1.1350.4.1 # Put the compiler barrier() on the right side of the preemption # enable on UP-PREEMPT. # # Without this, the enable could "migrate" up into the critical # region (on SMP, the actual spinlock would act as an additional # barrier and PREEMPT was ok). (A schematic follows below.) # -------------------------------------------- # 03/10/27 kevcorry@us.ibm.com 1.1350.4.2 # [PATCH] Fix DM on top of raid # # Force Device-Mapper to use PAGE_SIZE or smaller I/O when the underlying # device has a bvec_merge_fn routine registered. This will fix the # situation of Device-Mapper submitting I/Os to RAID-0 that span the # RAID-0 chunk boundaries. # # Joe is working on a better solution that actually honors the MD # merge_bvec_fn routine. But this minimal change will fix the problem for # the time being. # -------------------------------------------- # 03/10/27 torvalds@home.osdl.org 1.1367 # Merge bk://kernel.bkbits.net/davem/net-2.5 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/27 torvalds@home.osdl.org 1.1368 # Merge bk://kernel.bkbits.net/davem/sparc-2.5 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/27 eranian@hpl.hp.com 1.1337.45.18 # [PATCH] ia64: fix perfmon UP breakage # # -------------------------------------------- # 03/10/27 davidm@tiger.hpl.hp.com 1.1350.5.1 # Merge tiger.hpl.hp.com:/data1/bk/vanilla/linux-2.6.0-test9 # into tiger.hpl.hp.com:/data1/bk/lia64/to-linus-2.5 # -------------------------------------------- # 03/10/27 torvalds@home.osdl.org 1.1369 # Fix ZOMBIE race with self-reaping threads. # # exit_notify() used to leave open a window in which a dying thread # was visible as a ZOMBIE even though it had already reaped itself. # This closes that window by marking the thread DEAD within the # tasklist_lock. # -------------------------------------------- # 03/10/27 torvalds@home.osdl.org 1.1370 # Don't force PS/2 mouse rate or resolution by default. # # Only set the rate/resolution if the user actually asked # for it. Some mice and KVM switches don't like to have # their rate forced.
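#
# The UP-PREEMPT entry above turns on where the compiler barrier sits relative
# to the preempt count update. A self-contained schematic with stubbed names
# (a plain variable stands in for the kernel's per-task count; the real macros
# live in include/linux/preempt.h):
#
#	#include <stdio.h>
#
#	static volatile int preempt_count;
#	#define barrier() __asm__ __volatile__("" ::: "memory")
#
#	/* increment first, then fence: the critical region cannot float up */
#	#define preempt_disable() \
#		do { preempt_count++; barrier(); } while (0)
#
#	/* fence first, then decrement: the critical region cannot leak past
#	 * the point where preemption is re-enabled */
#	#define preempt_enable_no_resched() \
#		do { barrier(); preempt_count--; } while (0)
#
#	int main(void)
#	{
#		preempt_disable();
#		/* critical region: with the barrier on the wrong side of the
#		 * decrement, the compiler could hoist the enable above these
#		 * accesses on UP builds - exactly the bug described above */
#		preempt_enable_no_resched();
#		printf("preempt_count=%d\n", preempt_count);
#		return 0;
#	}
#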
# -------------------------------------------- # 03/10/27 akpm@osdl.org 1.1371 # [PATCH] Fix binfmt_misc locking # # This fixes a sleep-in-spinlock bug for binfmt_misc registration. # # That lock is purely for the list, not for the dentry. # -------------------------------------------- # 03/10/27 rmk@flint.arm.linux.org.uk 1.1350.6.1 # [PCMCIA] Fix card detection. # # Idea from David Hinds. # # Some PCMCIA/Cardbus controllers seem to get upset when we ask # them to re-do card interrogation - they miss the next insertion # event. # # We therefore avoid forcing needless card interrogations if a # card has already been successfully detected and interrogated. # -------------------------------------------- # 03/10/27 eranian@hpl.hp.com 1.1350.5.2 # [PATCH] ia64: fix 2 more perfmon2 bugs # # Here is the minimal patch that fixes things that do not work and that # can be noticed fairly easily: # # - remove a typo in pfm_check_task_state() which causes # PFM_READ_PMDS to fail when context is in PFM_MASKED state. # # - fix a typo in perfmon_mckinley.h when checking the value # combinations when writing to PMC14. This could reject a # valid request to program PMC14. # -------------------------------------------- # 03/10/28 acme@conectiva.com.br 1.1372 # [LLC]: Fix array indexing in llc_add_pack(). # -------------------------------------------- # 03/10/28 acme@conectiva.com.br 1.1373 # [LLC]: In llc_ui_connect(), return error properly when device not found. # -------------------------------------------- # 03/10/28 pee@erkkila.org 1.1374 # [IPV4]: Make sure ipgre_tunnel_init() gets the correct ioctl settings. # -------------------------------------------- # 03/10/28 andrew@com.rmk.(none) 1.1350.7.1 # [SERIAL PATCH] 1672/1: Restore sizeof(struct serial_struct) # # Patch from SAN People # # Patch 2.4.21-rmk1 added an "iomap_base" field to the serial_struct # structure (include/linux/serial.h). # # Since that structure is exported to user-space it should be # consistent between revisions of the stable 2.4 kernels. # # This patch removes 4 bytes (which were "reserved") to restore the size # of the structure. # # Without this patch, ioctl(TIOCGSERIAL) will copy_to_user() 4 # bytes more than expected and possibly corrupt the application's # stack/heap. # -------------------------------------------- # 03/10/28 davem@nuts.ninka.net 1.1371.1.1 # [SPARC]: Add AIO syscalls, 32-bit compat handling will come later. # -------------------------------------------- # 03/10/28 davem@nuts.ninka.net 1.1371.1.2 # [SPARC64]: Fix preempt handling in dec_and_lock.S # -------------------------------------------- # 03/10/28 yoshfuji@linux-ipv6.org 1.1375 # [IPV6]: Fix inappropriate usage of inet{,6}_sk(). # -------------------------------------------- # 03/10/28 yoshfuji@linux-ipv6.org 1.1376 # [IPV4]: Remove out-of-date info from the CONFIG_INET help text. # -------------------------------------------- # 03/10/28 matthias.andree@gmx.de 1.1371.2.1 # Merge bk://linux.bkbits.net/linux-2.5/ # into gmx.de:/suse/kernel/BK/linux-2.5 # -------------------------------------------- # 03/10/28 yoshfuji@linux-ipv6.org 1.1377 # [IPV6]: Fix outdated and inaccurate information in Kconfig help. # -------------------------------------------- # 03/10/28 davem@nuts.ninka.net 1.1371.1.3 # [SPARC64]: Get preempt building and working again.
# # - HAVE_DEC_LOCK depends on SMP # - Trap return preemption check needs interrupt disabled check # - Implement write_trylock # - Fix in_atomic() definition when PREEMPT enabled # -------------------------------------------- # 03/10/28 kml@patheticgeek.net 1.1378 # [TCP]: When SYN is set, the window is not scaled. # -------------------------------------------- # 03/10/28 Jay.Estabrook@hp.com 1.1371.3.1 # [PATCH] Fix alpha "white box" boot # # Here's a show-stopper patch for Alpha; missing it prevents several of # our platforms ("white box" 3000 and 5000 series) from booting. # -------------------------------------------- # 03/10/28 len.brown@intel.com 1.1371.4.1 # [ACPI] REVERT acpi_ec_gpe_query(ec) fix that crashed non-T40 boxes # http://bugme.osdl.org/show_bug.cgi?id=1171 # -------------------------------------------- # 03/10/29 matthias.andree@gmx.de 1.1371.2.2 # Merge bk://linux.bkbits.net/linux-2.5/ # into gmx.de:/suse/kernel/BK/linux-2.5 # -------------------------------------------- # 03/10/28 len.brown@intel.com 1.1371.4.2 # [ACPI] REVERT ACPICA-20030918 CONFIG_ACPI_DEBUG printk that caused crash # http://bugzilla.kernel.org/show_bug.cgi?id=1341 # -------------------------------------------- # 03/10/28 torvalds@home.osdl.org 1.1371.1.4 # Merge bk://kernel.bkbits.net/davem/sparc-2.5 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/28 torvalds@home.osdl.org 1.1379 # Merge bk://kernel.bkbits.net/davem/net-2.5 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/28 torvalds@home.osdl.org 1.1380 # Merge http://lia64.bkbits.net/to-linus-2.5 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/28 davem@nuts.ninka.net 1.1381 # [NET/COMPAT]: Fix copying of ipt_entry objects in do_netfilter_replace(). # # As noted by Georg Chini, ipt_entry objects are of variable size # so just copying individual struct ipt_entry slots around does # not work. # -------------------------------------------- # 03/10/28 janitor@sternwelten.at 1.1382 # [NETFILTER]: Add IPCHAINS to MAINTAINERS entry. # -------------------------------------------- # 03/10/28 acme@conectiva.com.br 1.1383 # [LLC]: Fix oops in procfs handling. # -------------------------------------------- # 03/10/29 rmk@flint.arm.linux.org.uk 1.1380.1.1 # Merge flint.arm.linux.org.uk:/usr/src/bk/linux-2.6 # into flint.arm.linux.org.uk:/usr/src/bk/linux-2.6-serial # -------------------------------------------- # 03/10/29 rmk@flint.arm.linux.org.uk 1.1380.1.2 # Merge bk://129.217.163.1/linux-2.5/ # into flint.arm.linux.org.uk:/usr/src/bk/linux-2.6-serial # -------------------------------------------- # 03/10/29 phillim2@comcast.net 1.1380.2.1 # [PATCH] ibmtr_cs/ibmtr - get working again # # Patch to get ibmtr_cs / ibmtr working again. A change that I missed went # in a while back and killed it. Also fixed the timer to eliminate the # uninitialized timer error on close. # -------------------------------------------- # 03/10/29 akpm@osdl.org 1.1380.2.2 # [PATCH] digi_accelport warning fix # # Use the correct type for the workqueue callback. # -------------------------------------------- # 03/10/29 akpm@osdl.org 1.1380.2.3 # [PATCH] JBD: use-after-free fix # # The wait_event() in there can touch the memory at *transaction after # kjournald has freed it. # # Rework the code to not wait until the transaction enters T_FLUSH state: just # loop back and try again after the wakeup.
# -------------------------------------------- # 03/10/29 akpm@osdl.org 1.1380.2.4 # [PATCH] WinTV-D patch to make tuner functional # # From: "Brad House" # # Quick patch to enable the Philips tuner on the WinTV-D boards. Tested and # works fine. (acked by Gerd) # -------------------------------------------- # 03/10/29 akpm@osdl.org 1.1380.2.5 # [PATCH] bttv jiffies warning fix # # Use unsigned long for time_after(), not an int. # -------------------------------------------- # 03/10/29 akpm@osdl.org 1.1380.2.6 # [PATCH] Export some symbols on x86-64 # # From: Andi Kleen # # Export two symbols on x86-64. This is needed for the sk98lin driver and ipv6. # -------------------------------------------- # 03/10/29 akpm@osdl.org 1.1380.2.7 # [PATCH] /proc/tty/driver/serial formatting fix # # From: Matthias Andree # # Properly terminate /proc/tty/driver/serial output lines of known UARTs # when the caller has no CAP_SYS_ADMIN capability. # -------------------------------------------- # 03/10/29 akpm@osdl.org 1.1380.2.8 # [PATCH] direct-io typo fix # # From: Klaas de Waal # # Bug in parameter of ZERO_PAGE macro in line 679 of fs/direct-io.c. Parameter # dio->cur_user_address has to be dio->curr_user_address. This bug shows # when compiling for MIPS little endian as target, not when compiling for X86. # -------------------------------------------- # 03/10/29 torvalds@home.osdl.org 1.1380.2.9 # Merge http://linux-acpi.bkbits.net/linux-acpi-release-2.6.0 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/29 akpm@osdl.org 1.1380.2.10 # [PATCH] sis900 skb free fix # # This driver is freeing skb's from timer context, with local irq's disabled. # # It generates warnings from local_bh_enable() because local_bh_enable() # reenables interrupts, exposing the machine to deadlocks. # # So use the deferred dev_kfree_skb_irq() instead. # -------------------------------------------- # 03/10/29 akpm@osdl.org 1.1380.2.11 # [PATCH] initcall ordering fix for PNP NICs # # From: "M.H.VanLeeuwen" # # The level of isapnp_init was moved to after acpi some time ago. Since it is # now after net_dev_init, ISA PNP NICs fail to initialize at boot. This is # particularly problematic for NFS root filesystems like mine, or non-modular # systems. # # This fix allows ISA PNP NIC cards to work during net_dev_init, and still # leaves isapnp_init after acpi_init. # -------------------------------------------- # 03/10/29 acme@conectiva.com.br 1.1384 # [LLC]: llc_lookup_listener has to consider the 'any' mac address # -------------------------------------------- # 03/10/29 acme@conectiva.com.br 1.1385 # [LLC]: fix net_device refcounting bug # -------------------------------------------- # 03/10/29 acme@conectiva.com.br 1.1386 # [LLC]: fix bug that prevented fcntl(O_NONBLOCK) from working with PF_LLC sockets # -------------------------------------------- # 03/10/29 arjanv@redhat.com 1.1380.2.12 # [PATCH] r8169 module license tag # -------------------------------------------- # 03/10/29 acme@conectiva.com.br 1.1387 # [LLC]: set local mac addr at connect time when userland left it as zeroes # -------------------------------------------- # 03/10/29 tsk@ibakou.com 1.1380.2.13 # [netdrvr 8139too] add pci id # -------------------------------------------- # 03/10/29 riel@surriel.com 1.1380.2.14 # [netdrvr starfire] include asm/io.h # # Fixes build on some platforms.
# -------------------------------------------- # 03/10/29 achirica@telefonica.net 1.1380.2.15 # [PATCH] Fix compatibility issue with some APs # -------------------------------------------- # 03/10/29 rmk@arm.linux.org.uk 1.1380.2.16 # [PATCH] 2.6.0-test8: fix ARM ether driver naming # # Ensure that arm ether drivers print the correct ether device name rather # than "eth%d". # -------------------------------------------- # 03/10/29 komujun@nifty.com 1.1380.2.17 # [pcmcia fmvj18x_cs] share interrupts properly for TDK multifunction cards. # -------------------------------------------- # 03/10/29 amir.noam@intel.com 1.1380.2.18 # [netdrvr bonding] fix monitoring functions # # This fix got missed in the bonding patchset applied a while ago. # -------------------------------------------- # 03/10/29 kolya@mit.edu 1.1388 # [NET]: Allow SOMAXCONN to be adjusted via sysctl. # -------------------------------------------- # 03/10/29 acme@conectiva.com.br 1.1389 # [NET]: Introduce dev_getbyfirsthwtype. # -------------------------------------------- # 03/10/29 acme@conectiva.com.br 1.1390 # [LLC]: when the user doesn't specify a local address to connect, do an autobind # # Other protocols do this as soon as they discover over what interface the # packet will be routed, but LLC isn't routable, so, to provide similar # semantics to the other protocols, I'm just binding it to the first interface # of the type specified. Perhaps we'll need a tunable for this or some sort of # routing table maintained manually by the admin; we'll see later. For now this # allows an application like openssh, with patched getaddrinfo/getnameinfo, to # use PF_LLC sockets with a very small patch. # -------------------------------------------- # 03/10/29 ebrower@usa.net 1.1380.3.1 # [SPARC]: Fix _IOC_SIZE() macro when direction is _IOC_NONE. # -------------------------------------------- # 03/10/29 arjanv@redhat.com 1.1380.2.19 # [PATCH] fix starfire 64-bit b0rkage # # (x >> 32) is undefined on a 32 bit integral variable in C; in contrast # (x >>16 >> 16) is fine (and gets optimized out to 0, while (x >> 32) # gets optimized out to a nop). # # Fix for starfire below; a compilable illustration of the rule also # follows below. # -------------------------------------------- # 03/10/29 achirica@telefonica.net 1.1380.2.20 # [PATCH] Fix wireless stats locking # -------------------------------------------- # 03/10/29 akpm@osdl.org 1.1380.2.21 # [netdrvr 3c527] add MODULE_LICENSE tag # -------------------------------------------- # 03/10/29 torvalds@home.osdl.org 1.1380.2.22 # Merge bk://bk.arm.linux.org.uk/linux-2.6-pcmcia # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/29 torvalds@home.osdl.org 1.1380.1.3 # Merge bk://bk.arm.linux.org.uk/linux-2.6-serial # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/29 shaggy@shaggy.austin.ibm.com 1.1380.4.1 # JFS: remove racy, redundant call to block_invalidatepage # # __invalidate_metapages references mp->page after releasing the # meta_lock spinlock, without increasing the use count. This is racy and # unnecessary since setting the META_discard flag is sufficient. # block_invalidatepage() will be called when the metapage is released. # -------------------------------------------- # 03/10/29 davem@nuts.ninka.net 1.1391 # [NET]: Make skb_copy_expand() copy header area too.
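#
# The starfire entry above rests on a C rule worth a compilable illustration:
# shifting a 32-bit value by its full width is undefined behavior, while two
# half-width shifts are well defined (the undefined line is deliberately left
# commented out):
#
#	#include <stdio.h>
#
#	int main(void)
#	{
#		unsigned long addr = 0x12345678UL; /* 32 bits on a 32-bit target */
#
#		/* undefined there - the shift count equals the type width; on
#		 * x86 it typically compiles to a no-op, yielding addr, not 0:
#		 * unsigned long hi_bad = addr >> 32; */
#
#		unsigned long hi_ok = addr >> 16 >> 16; /* well defined: 0 */
#
#		printf("%lx\n", hi_ok);
#		return 0;
#	}
#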
# -------------------------------------------- # 03/10/29 torvalds@home.osdl.org 1.1380.1.4 # Merge http://jfs.bkbits.net/linux-2.5 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/29 ink@jurassic.park.msu.ru 1.1380.5.1 # [PATCH] PCI: fix bug in pci_setup_bridge() # # This bug prevents Alphas with older firmware from booting if there # is a card with a PCI-PCI bridge that supports 32-bit IO. # This has happened on an AS2100 with a quad-tulip card, for example: # - initially, the I/O window of the 21152 bridge was 0x10000-0x10fff, # as set up by firmware; # - pci_setup_bridge() is going to change this, say, to 0xa000-0xafff: # first, it updates the PCI_IO_BASE_UPPER16 and PCI_IO_LIMIT_UPPER16 # registers, so that the IO window is temporarily at 0x0000-0x0fff, # which effectively blocks all legacy IO ports in the lower # 4K range, such as serial, floppy, RTC and so on; # then it does a debugging printk - the machine dies here with recursive # machine checks as the serial console has gone. # # Moving (or disabling) the debugging printk is not a solution - # there is a possibility that a timer interrupt (which might access RTC # ports) occurs between writes to the lower and upper parts of the # base/limit registers. # # The patch temporarily disables the IO window of the bridge by # setting PCI_IO_BASE_UPPER16 > PCI_IO_LIMIT_UPPER16 before doing # an update. It's safe, as we don't have any active IO behind # the bridge at this point. Also, it's a NOP for bridges with # 16-bit-only IO. # A similar (but simpler, as we always clear the upper 32 bits) fix # is applied to the 64-bit prefetchable MMIO range. # -------------------------------------------- # 03/10/29 greg@kroah.com 1.1380.5.2 # [PATCH] I2C: remove some MOD_INC and MOD_DEC usages that are not needed anymore. # -------------------------------------------- # 03/10/29 greg@kroah.com 1.1380.5.3 # [PATCH] USB: don't build the whiteheat driver if on SMP as the locking is all messed up. # -------------------------------------------- # 03/10/29 torvalds@home.osdl.org 1.1380.1.5 # Merge bk://kernel.bkbits.net/gregkh/linux/pci-2.6 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/29 rusty@rustcorp.com.au 1.1380.1.6 # [PATCH] Fix for module initialization failure # # Bug reported by Paul Mackerras: if a module parameter fails, we didn't # call module_arch_cleanup(). # # On x86 this was harmless (module_arch_cleanup() is a no-op), but on # other architectures like PPC this causes inconsistent data structures # and subsequent oopses. # -------------------------------------------- # 03/10/30 shaggy@shaggy.austin.ibm.com 1.1380.4.2 # JFS: Fix race between link() and unlink() # # JFS isn't happy when it thinks a file has been removed and link() increases # its nlink count back from zero. In 2.4, i_zombie prevented this race # condition. # # http://bugzilla.kernel.org/show_bug.cgi?id=866 # -------------------------------------------- # 03/10/30 torvalds@home.osdl.org 1.1380.1.7 # Merge http://jfs.bkbits.net/linux-2.5 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/30 yoshfuji@linux-ipv6.org 1.1392 # [CRYPTO]: crypto_alg_lookup() should fail when passed a NULL name. # -------------------------------------------- # 03/10/30 herbert@gondor.apana.org.au 1.1393 # [IPSEC]: Missing NULL algorithm checks in AH and IPCOMP init.
# -------------------------------------------- # 03/10/30 acme@conectiva.com.br 1.1394 # [LLC]: Fix sockaddr; only one MAC address needs to be provided, not three. # -------------------------------------------- # 03/10/30 davem@kernel.bkbits.net 1.1380.6.1 # Merge davem@nuts.ninka.net:/disk1/davem/BK/sparc-2.5 # into kernel.bkbits.net:/home/davem/sparc-2.5 # -------------------------------------------- # 03/10/30 davem@nuts.ninka.net 1.1395 # Merge nuts.ninka.net:/disk1/davem/BK/network-2.5 # into nuts.ninka.net:/disk1/davem/BK/net-2.5 # -------------------------------------------- # 03/10/30 torvalds@home.osdl.org 1.1380.1.8 # Merge bk://kernel.bkbits.net/davem/sparc-2.5 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/30 torvalds@home.osdl.org 1.1380.1.9 # Stop SIS 96x chips from lying about themselves. # # Some machines with the SIS 96x southbridge have it set up # to claim it is a SIS 503 chip. That breaks irq routing logic # among other things. Fix it properly by making everybody aware # of the duplicity. # -------------------------------------------- # 03/10/30 torvalds@home.osdl.org 1.1396 # Merge bk://kernel.bkbits.net/davem/net-2.5 # into home.osdl.org:/home/torvalds/v2.5/linux # -------------------------------------------- # 03/10/31 mjc@smp.uni.325i.org 1.1397 # Merge bk://linux.bkbits.net/linux-2.5 # into smp.uni.325i.org:/usr/src/linux-2.6 # -------------------------------------------- # diff -Nru a/Documentation/networking/irda.txt b/Documentation/networking/irda.txt --- a/Documentation/networking/irda.txt Fri Oct 31 14:10:53 2003 +++ b/Documentation/networking/irda.txt Fri Oct 31 14:10:53 2003 @@ -3,12 +3,12 @@ programs can be found on http://irda.sourceforge.net/ For more information about how to use the IrDA protocol stack, see the -IR-HOWTO (http://www.mobilix.org/Infrared-HOWTO/Infrared-HOWTO.html) written by Werner Heuser - +Linux Infrared HOWTO (http://www.tuxmobil.org/Infrared-HOWTO/Infrared-HOWTO.html) +by Werner Heuser There is an active mailing list for discussing Linux-IrDA matters called -linux-irda. To subscribe to it, visit: + irda-users@lists.sourceforge.net + + - http://www.pasta.cs.uit.no/mailman/listinfo/linux-irda -Dag Brattli diff -Nru a/Documentation/video4linux/meye.txt b/Documentation/video4linux/meye.txt --- a/Documentation/video4linux/meye.txt Fri Oct 31 14:10:54 2003 +++ b/Documentation/video4linux/meye.txt Fri Oct 31 14:10:54 2003 @@ -33,6 +33,11 @@ driver however), but things are not moving very fast (see http://r-engine.sourceforge.net/) (PCI vendor/device is 0x10cf/0x2011). +There is a fourth model connected on the USB bus in TR1* Vaio laptops. +This camera is not supported at all by the current driver, in fact +little information if any is available for this camera +(USB vendor/device is 0x054c/0x0107).
+ Driver options: --------------- diff -Nru a/MAINTAINERS b/MAINTAINERS --- a/MAINTAINERS Fri Oct 31 14:10:54 2003 +++ b/MAINTAINERS Fri Oct 31 14:10:54 2003 @@ -1356,7 +1356,7 @@ L: linux-scsi@vger.kernel.org S: Maintained -NETFILTER/IPTABLES +NETFILTER/IPTABLES/IPCHAINS P: Rusty Russell P: Marc Boucher P: James Morris diff -Nru a/Makefile b/Makefile --- a/Makefile Fri Oct 31 14:10:54 2003 +++ b/Makefile Fri Oct 31 14:10:54 2003 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -test8 +EXTRAVERSION = -test9 # *DOCUMENTATION* # To see a list of typical targets execute "make help" @@ -805,7 +805,7 @@ # --------------------------------------------------------------------------- define all-sources - ( find . $(RCS_FIND_IGNORE) \ + ( find init kernel mm fs ipc lib drivers/block security arch -follow $(RCS_FIND_IGNORE) -name ulevel -prune -o \ \( -name include -o -name arch \) -prune -o \ -name '*.[chS]' -print; \ find arch/$(ARCH) $(RCS_FIND_IGNORE) \ diff -Nru a/Makefile~all-sources.diff b/Makefile~all-sources.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/Makefile~all-sources.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,1029 @@ +VERSION = 2 +PATCHLEVEL = 6 +SUBLEVEL = 0 +EXTRAVERSION = -test7 + +# *DOCUMENTATION* +# To see a list of typical targets execute "make help" +# More info can be located in ./README +# Comments in this file are targeted only to the developer, do not +# expect to learn how to build the kernel reading this file. + +# Do not print "Entering directory ..." +MAKEFLAGS += --no-print-directory + +# We are using a recursive build, so we need to do a little thinking +# to get the ordering right. +# +# Most importantly: sub-Makefiles should only ever modify files in +# their own directory. If in some directory we have a dependency on +# a file in another dir (which doesn't happen often, but it's often +# unavoidable when linking the built-in.o targets which finally +# turn into vmlinux), we will call a sub make in that other dir, and +# after that we are sure that everything which is in that other dir +# is now up to date. +# +# The only cases where we need to modify files which have global +# effects are thus separated out and done before the recursive +# descending is started. They are now explicitly listed as the +# prepare rule. + +# To put more focus on warnings, be less verbose as default +# Use 'make V=1' to see the full commands + +ifdef V + ifeq ("$(origin V)", "command line") + KBUILD_VERBOSE = $(V) + endif +endif +ifndef KBUILD_VERBOSE + KBUILD_VERBOSE = 0 +endif + +# Call sparse as part of compilation of C files +# Use 'make C=1' to enable sparse checking + +ifdef C + ifeq ("$(origin C)", "command line") + KBUILD_CHECKSRC = $(C) + endif +endif +ifndef KBUILD_CHECKSRC + KBUILD_CHECKSRC = 0 +endif + +# kbuild supports saving output files in a separate directory. +# To locate output files in a separate directory two syntaxes are supported. +# In both cases the working directory must be the root of the kernel src. +# 1) O= +# Use "make O=dir/to/store/output/files/" +# +# 2) Set KBUILD_OUTPUT +# Set the environment variable KBUILD_OUTPUT to point to the directory +# where the output files shall be placed. +# export KBUILD_OUTPUT=dir/to/store/output/files/ +# make +# +# The O= assignment takes precedence over the KBUILD_OUTPUT environment variable.
+ + +# KBUILD_SRC is set on invocation of make in OBJ directory +# KBUILD_SRC is not intended to be used by the regular user (for now) +ifeq ($(KBUILD_SRC),) + +# OK, Make called in directory where kernel src resides +# Do we want to locate output files in a separate directory? +ifdef O + ifeq ("$(origin O)", "command line") + KBUILD_OUTPUT := $(O) + endif +endif + +# That's our default target when none is given on the command line +.PHONY: all +all: + +ifneq ($(KBUILD_OUTPUT),) +# Invoke a second make in the output directory, passing relevant variables +# check that the output directory actually exists +saved-output := $(KBUILD_OUTPUT) +KBUILD_OUTPUT := $(shell cd $(KBUILD_OUTPUT) && /bin/pwd) +$(if $(wildcard $(KBUILD_OUTPUT)),, \ + $(error output directory "$(saved-output)" does not exist)) + +.PHONY: $(MAKECMDGOALS) + +$(filter-out all,$(MAKECMDGOALS)) all: + $(if $(KBUILD_VERBOSE:1=),@)$(MAKE) -C $(KBUILD_OUTPUT) \ + KBUILD_SRC=$(CURDIR) KBUILD_VERBOSE=$(KBUILD_VERBOSE) \ + KBUILD_CHECK=$(KBUILD_CHECK) -f $(CURDIR)/Makefile $@ + +# Leave processing to above invocation of make +skip-makefile := 1 +endif # ifneq ($(KBUILD_OUTPUT),) +endif # ifeq ($(KBUILD_SRC),) + +# We process the rest of the Makefile if this is the final invocation of make +ifeq ($(skip-makefile),) + +srctree := $(if $(KBUILD_SRC),$(KBUILD_SRC),$(CURDIR)) +TOPDIR := $(srctree) +# FIXME - TOPDIR is obsolete, use srctree/objtree +objtree := $(CURDIR) +src := $(srctree) +obj := $(objtree) + +VPATH := $(srctree) + +export srctree objtree VPATH TOPDIR + +KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) + +# SUBARCH tells the usermode build what the underlying arch is. That is set +# first, and if a usermode build is happening, the "ARCH=um" on the command +# line overrides the setting of ARCH below. If a native build is happening, +# then ARCH is assigned, getting whatever value it gets normally, and +# SUBARCH is subsequently ignored. + +SUBARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \ + -e s/arm.*/arm/ -e s/sa110/arm/ \ + -e s/s390x/s390/ -e s/parisc64/parisc/ ) + +# Cross compiling and selecting different set of gcc/bin-utils +# --------------------------------------------------------------------------- +# +# When performing cross compilation for other architectures ARCH shall be set +# to the target architecture. (See arch/* for the possibilities). +# ARCH can be set during invocation of make: +# make ARCH=ia64 +# Another way is to have ARCH set in the environment. +# The default ARCH is the host where make is executed. + +# CROSS_COMPILE specify the prefix used for all executables used +# during compilation. Only gcc and related bin-utils executables +# are prefixed with $(CROSS_COMPILE). +# CROSS_COMPILE can be set on the command line +# make CROSS_COMPILE=ia64-linux- +# Alternatively CROSS_COMPILE can be set in the environment. +# Default value for CROSS_COMPILE is not to prefix executables +# Note: Some architectures assign CROSS_COMPILE in their arch/*/Makefile + +ARCH ?= $(SUBARCH) +CROSS_COMPILE ?= + +# Architecture as present in compile.h +UTS_MACHINE := $(ARCH) + +# SHELL used by kbuild +CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ + else if [ -x /bin/bash ]; then echo /bin/bash; \ + else echo sh; fi ; fi) + +HOSTCC = gcc +HOSTCXX = g++ +HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer +HOSTCXXFLAGS = -O2 + +# Decide whether to build built-in, modular, or both. +# Normally, just do built-in. 
+ +KBUILD_MODULES := +KBUILD_BUILTIN := 1 + +# If we have only "make modules", don't compile built-in objects. +# When we're building modules with modversions, we need to consider +# the built-in objects during the descend as well, in order to +# make sure the checksums are uptodate before we record them. + +ifeq ($(MAKECMDGOALS),modules) + KBUILD_BUILTIN := $(if $(CONFIG_MODVERSIONS),1) +endif + +# If we have "make modules", compile modules +# in addition to whatever we do anyway. +# Just "make" or "make all" shall build modules as well + +ifneq ($(filter all modules,$(MAKECMDGOALS)),) + KBUILD_MODULES := 1 +endif + +ifeq ($(MAKECMDGOALS),) + KBUILD_MODULES := 1 +endif + +export KBUILD_MODULES KBUILD_BUILTIN KBUILD_VERBOSE +export KBUILD_CHECKSRC KBUILD_SRC + +# Beautify output +# --------------------------------------------------------------------------- +# +# Normally, we echo the whole command before executing it. By making +# that echo $($(quiet)$(cmd)), we now have the possibility to set +# $(quiet) to choose other forms of output instead, e.g. +# +# quiet_cmd_cc_o_c = Compiling $(RELDIR)/$@ +# cmd_cc_o_c = $(CC) $(c_flags) -c -o $@ $< +# +# If $(quiet) is empty, the whole command will be printed. +# If it is set to "quiet_", only the short version will be printed. +# If it is set to "silent_", nothing wil be printed at all, since +# the variable $(silent_cmd_cc_o_c) doesn't exist. +# +# A simple variant is to prefix commands with $(Q) - that's usefull +# for commands that shall be hidden in non-verbose mode. +# +# $(Q)ln $@ :< +# +# If KBUILD_VERBOSE equals 0 then the above command will be hidden. +# If KBUILD_VERBOSE equals 1 then the above command is displayed. + +ifeq ($(KBUILD_VERBOSE),1) + quiet = + Q = +else + quiet=quiet_ + Q = @ +endif + +# If the user is running make -s (silent mode), suppress echoing of +# commands + +ifneq ($(findstring s,$(MAKEFLAGS)),) + quiet=silent_ +endif + +check_gcc = $(shell if $(CC) $(CFLAGS) $(1) -S -o /dev/null -xc /dev/null > /dev/null 2>&1; then echo "$(1)"; else echo "$(2)"; fi ;) + +export quiet Q KBUILD_VERBOSE check_gcc + +# Look for make include files relative to root of kernel src +MAKEFLAGS += --include-dir=$(srctree) + +# For maximum performance (+ possibly random breakage, uncomment +# the following) + +#MAKEFLAGS += -rR + +# Make variables (CC, etc...) 
+ +AS = $(CROSS_COMPILE)as +LD = $(CROSS_COMPILE)ld +CC = $(CROSS_COMPILE)gcc +CPP = $(CC) -E +AR = $(CROSS_COMPILE)ar +NM = $(CROSS_COMPILE)nm +STRIP = $(CROSS_COMPILE)strip +OBJCOPY = $(CROSS_COMPILE)objcopy +OBJDUMP = $(CROSS_COMPILE)objdump +AWK = awk +RPM := $(shell if [ -x "/usr/bin/rpmbuild" ]; then echo rpmbuild; \ + else echo rpm; fi) +GENKSYMS = scripts/genksyms/genksyms +DEPMOD = /sbin/depmod +KALLSYMS = scripts/kallsyms +PERL = perl +CHECK = sparse +MODFLAGS = -DMODULE +CFLAGS_MODULE = $(MODFLAGS) +AFLAGS_MODULE = $(MODFLAGS) +LDFLAGS_MODULE = -r +CFLAGS_KERNEL = +AFLAGS_KERNEL = + +NOSTDINC_FLAGS = -nostdinc -iwithprefix include + +CPPFLAGS := -D__KERNEL__ -Iinclude \ + $(if $(KBUILD_SRC),-Iinclude2 -I$(srctree)/include) + +CFLAGS := -Wall -Wstrict-prototypes -Wno-trigraphs -O2 \ + -fno-strict-aliasing -fno-common +AFLAGS := -D__ASSEMBLY__ + +export VERSION PATCHLEVEL SUBLEVEL EXTRAVERSION KERNELRELEASE ARCH \ + CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC \ + CPP AR NM STRIP OBJCOPY OBJDUMP MAKE AWK GENKSYMS PERL UTS_MACHINE \ + HOSTCXX HOSTCXXFLAGS LDFLAGS_BLOB LDFLAGS_MODULE CHECK + +export CPPFLAGS NOSTDINC_FLAGS OBJCOPYFLAGS LDFLAGS +export CFLAGS CFLAGS_KERNEL CFLAGS_MODULE +export AFLAGS AFLAGS_KERNEL AFLAGS_MODULE + +export MODVERDIR := .tmp_versions + +# The temporary file to save gcc -MD generated dependencies must not +# contain a comma +comma := , +depfile = $(subst $(comma),_,$(@D)/.$(@F).d) + +# Files to ignore in find ... statements + +RCS_FIND_IGNORE := \( -name SCCS -o -name BitKeeper -o -name .svn -o -name CVS \) -prune -o +RCS_TAR_IGNORE := --exclude SCCS --exclude BitKeeper --exclude .svn --exclude CVS + +# =========================================================================== +# Rules shared between *config targets and build targets + +# Helpers built in scripts/ + +scripts/docproc scripts/split-include : scripts ; + +.PHONY: scripts scripts/fixdep +scripts: + $(Q)$(MAKE) $(build)=scripts + +scripts/fixdep: + $(Q)$(MAKE) $(build)=scripts $@ + + +# To make sure we do not include .config for any of the *config targets +# catch them early, and hand them over to scripts/kconfig/Makefile +# It is allowed to specify more targets when calling make, including +# mixing *config targets and build targets. +# For example 'make oldconfig all'. +# Detect when mixed targets is specified, and make a second invocation +# of make so .config is not included in this case either (for *config). + +no-dot-config-targets := clean mrproper distclean \ + cscope TAGS tags help %docs check% + +config-targets := 0 +mixed-targets := 0 +dot-config := 1 + +ifneq ($(filter $(no-dot-config-targets), $(MAKECMDGOALS)),) + ifeq ($(filter-out $(no-dot-config-targets), $(MAKECMDGOALS)),) + dot-config := 0 + endif +endif + +ifneq ($(filter config %config,$(MAKECMDGOALS)),) + config-targets := 1 + ifneq ($(filter-out config %config,$(MAKECMDGOALS)),) + mixed-targets := 1 + endif +endif + +ifeq ($(mixed-targets),1) +# =========================================================================== +# We're called with mixed targets (*config and build targets). +# Handle them one by one. 
+ +%:: FORCE + $(Q)$(MAKE) -C $(srctree) KBUILD_SRC= $@ + +else +ifeq ($(config-targets),1) +# =========================================================================== +# *config targets only - make sure prerequisites are updated, and descend +# in scripts/kconfig to make the *config target + +%config: scripts/fixdep FORCE + $(Q)$(MAKE) $(build)=scripts/kconfig $@ +config : scripts/fixdep FORCE + $(Q)$(MAKE) $(build)=scripts/kconfig $@ + +else +# =========================================================================== +# Build targets only - this includes vmlinux, arch specific targets, clean +# targets and others. In general all targets except *config targets. + +# That's our default target when none is given on the command line +# Note that 'modules' will be added as a prerequisite as well, +# in the CONFIG_MODULES part below + +all: vmlinux + +# Objects we will link into vmlinux / subdirs we need to visit +init-y := init/ +drivers-y := drivers/ sound/ +net-y := net/ +libs-y := lib/ +core-y := usr/ +SUBDIRS := + +ifeq ($(dot-config),1) +# In this section, we need .config + +# Read in dependencies to all Kconfig* files, make sure to run +# oldconfig if changes are detected. +-include .config.cmd + +include .config + +# If .config needs to be updated, it will be done via the dependency +# that autoconf has on .config. +# To avoid any implicit rule to kick in, define an empty command +.config: ; + +# If .config is newer than include/linux/autoconf.h, someone tinkered +# with it and forgot to run make oldconfig +include/linux/autoconf.h: .config + $(Q)$(MAKE) -f $(srctree)/Makefile silentoldconfig + +endif + +include $(srctree)/arch/$(ARCH)/Makefile + +# Let architecture Makefiles change CPPFLAGS if needed +CFLAGS := $(CPPFLAGS) $(CFLAGS) +AFLAGS := $(CPPFLAGS) $(AFLAGS) + +core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ + +SUBDIRS += $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ + $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ + $(net-y) $(net-m) $(libs-y) $(libs-m))) + +ALL_SUBDIRS := $(sort $(SUBDIRS) $(patsubst %/,%,$(filter %/, \ + $(init-n) $(init-) \ + $(core-n) $(core-) $(drivers-n) $(drivers-) \ + $(net-n) $(net-) $(libs-n) $(libs-)))) + +init-y := $(patsubst %/, %/built-in.o, $(init-y)) +core-y := $(patsubst %/, %/built-in.o, $(core-y)) +drivers-y := $(patsubst %/, %/built-in.o, $(drivers-y)) +net-y := $(patsubst %/, %/built-in.o, $(net-y)) +libs-y1 := $(patsubst %/, %/lib.a, $(libs-y)) +libs-y2 := $(patsubst %/, %/built-in.o, $(libs-y)) +libs-y := $(libs-y1) $(libs-y2) + +# Here goes the main Makefile +# --------------------------------------------------------------------------- + + +ifndef CONFIG_FRAME_POINTER +CFLAGS += -fomit-frame-pointer +endif + +ifdef CONFIG_DEBUG_INFO +CFLAGS += -g +endif + +# warn about C99 declaration after statement +CFLAGS += $(call check_gcc,-Wdeclaration-after-statement,) + +# +# INSTALL_PATH specifies where to place the updated kernel and system map +# images. Uncomment if you want to place them anywhere other than root. +# + +#export INSTALL_PATH=/boot + +# +# INSTALL_MOD_PATH specifies a prefix to MODLIB for module directory +# relocations required by build roots. This is not defined in the +# makefile but the arguement can be passed to make if needed. 
+# + +MODLIB := $(INSTALL_MOD_PATH)/lib/modules/$(KERNELRELEASE) +export MODLIB + +# Build vmlinux +# --------------------------------------------------------------------------- + +# This is a bit tricky: If we need to relink vmlinux, we want +# the version number incremented, which means recompile init/version.o +# and relink init/init.o. However, we cannot do this during the +# normal descending-into-subdirs phase, since at that time +# we cannot yet know if we will need to relink vmlinux. +# So we descend into init/ inside the rule for vmlinux again. +head-y += $(HEAD) +vmlinux-objs := $(head-y) $(init-y) $(core-y) $(libs-y) $(drivers-y) $(net-y) + +quiet_cmd_vmlinux__ = LD $@ +define cmd_vmlinux__ + $(LD) $(LDFLAGS) $(LDFLAGS_vmlinux) $(head-y) $(init-y) \ + --start-group \ + $(core-y) \ + $(libs-y) \ + $(drivers-y) \ + $(net-y) \ + --end-group \ + $(filter .tmp_kallsyms%,$^) \ + -o $@ +endef + +# set -e makes the rule exit immediately on error + +define rule_vmlinux__ + +set -e; \ + $(if $(filter .tmp_kallsyms%,$^),, \ + echo ' GEN .version'; \ + . $(srctree)/scripts/mkversion > .tmp_version; \ + mv -f .tmp_version .version; \ + $(MAKE) $(build)=init; \ + ) \ + $(if $($(quiet)cmd_vmlinux__), \ + echo ' $($(quiet)cmd_vmlinux__)' &&) \ + $(cmd_vmlinux__); \ + echo 'cmd_$@ := $(cmd_vmlinux__)' > $(@D)/.$(@F).cmd +endef + +define rule_vmlinux + $(rule_vmlinux__); \ + $(NM) $@ | grep -v '\(compiled\)\|\(\.o$$\)\|\( [aUw] \)\|\(\.\.ng$$\)\|\(LASH[RL]DI\)' | sort > System.map +endef + +LDFLAGS_vmlinux += -T arch/$(ARCH)/kernel/vmlinux.lds.s + +# Generate section listing all symbols and add it into vmlinux +# It's a three stage process: +# o .tmp_vmlinux1 has all symbols and sections, but __kallsyms is +# empty +# Running kallsyms on that gives us .tmp_kallsyms1.o with +# the right size +# o .tmp_vmlinux2 now has a __kallsyms section of the right size, +# but due to the added section, some addresses have shifted +# From here, we generate a correct .tmp_kallsyms2.o +# o The correct .tmp_kallsyms2.o is linked into the final vmlinux. + +ifdef CONFIG_KALLSYMS + +kallsyms.o := .tmp_kallsyms2.o + +quiet_cmd_kallsyms = KSYM $@ +cmd_kallsyms = $(NM) -n $< | $(KALLSYMS) > $@ + +.tmp_kallsyms1.o .tmp_kallsyms2.o: %.o: %.S scripts FORCE + $(call if_changed_dep,as_o_S) + +.tmp_kallsyms%.S: .tmp_vmlinux% + $(call cmd,kallsyms) + +.tmp_vmlinux1: $(vmlinux-objs) arch/$(ARCH)/kernel/vmlinux.lds.s FORCE + +$(call if_changed_rule,vmlinux__) + +.tmp_vmlinux2: $(vmlinux-objs) .tmp_kallsyms1.o arch/$(ARCH)/kernel/vmlinux.lds.s FORCE + $(call if_changed_rule,vmlinux__) + +endif + +# Finally the vmlinux rule + +vmlinux: $(vmlinux-objs) $(kallsyms.o) arch/$(ARCH)/kernel/vmlinux.lds.s FORCE + $(call if_changed_rule,vmlinux) + +# The actual objects are generated when descending, +# make sure no implicit rule kicks in + +$(sort $(vmlinux-objs)) arch/$(ARCH)/kernel/vmlinux.lds.s: $(SUBDIRS) ; + +# Handle descending into subdirectories listed in $(SUBDIRS) + +.PHONY: $(SUBDIRS) +$(SUBDIRS): prepare-all + $(Q)$(MAKE) $(build)=$@ + +# Things we need to do before we recursively start building the kernel +# or the modules are listed in "prepare-all". +# A multi level approach is used. prepare1 is updated first, then prepare0. +# prepare-all is the collection point for the prepare targets. 
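The .tmp_kallsyms staging above exists because embedding the symbol table changes the very addresses the table records: the first link fixes the table's size, the second fixes its contents. scripts/kallsyms itself is a small host program that turns `nm -n` output into an assembly file. The following is a drastically simplified, hypothetical sketch of that idea — no name compression, 32-bit addresses assumed, only text symbols kept — not the real generator:

    /* Read "nm -n vmlinux" on stdin, emit an assembly-language symbol
     * table on stdout.  Sketch only: the real scripts/kallsyms also
     * compresses names and handles more symbol classes. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long long addr;
        char type, name[128];
        unsigned long count = 0;

        printf("\t.data\n\t.globl kallsyms_table\nkallsyms_table:\n");
        while (scanf("%llx %c %127s", &addr, &type, name) == 3) {
            if (type != 't' && type != 'T')
                continue;       /* keep text symbols only */
            /* .long assumes a 32-bit target such as i386 */
            printf("\t.long 0x%llx\n\t.asciz \"%s\"\n", addr, name);
            count++;
        }
        printf("\t.globl kallsyms_num_syms\nkallsyms_num_syms:\n"
               "\t.long %lu\n", count);
        return 0;
    }

Because `nm -n` emits symbols already sorted by address, the generated table can be binary-searched at run time with no further sorting.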
+ +.PHONY: prepare-all prepare prepare0 prepare1 + +# prepare1 is used to check if we are building in a separate output directory, +# and if so do: +# 1) Check that make has not been executed in the kernel src $(srctree) +# 2) Create the include2 directory, used for the second asm symlink + +prepare1: +ifneq ($(KBUILD_SRC),) + @echo ' Using $(srctree) as source for kernel' + $(Q)if [ -h $(srctree)/include/asm -o -f $(srctree)/.config ]; then \ + echo " $(srctree) is not clean, please run 'make mrproper'";\ + echo " in the '$(srctree)' directory.";\ + /bin/false; \ + fi; + $(Q)if [ ! -d include2 ]; then mkdir -p include2; fi; + $(Q)ln -fsn $(srctree)/include/asm-$(ARCH) include2/asm +endif + +prepare0: prepare1 include/linux/version.h include/asm include/config/MARKER +ifdef KBUILD_MODULES +ifeq ($(origin SUBDIRS),file) + $(Q)rm -rf $(MODVERDIR) +else + @echo '*** Warning: Overriding SUBDIRS on the command line can cause' + @echo '*** inconsistencies' +endif +endif + $(if $(CONFIG_MODULES),$(Q)mkdir -p $(MODVERDIR)) + +# All the preparing.. +prepare-all: prepare0 prepare + +# Leave this as default for preprocessing vmlinux.lds.S, which is now +# done in arch/$(ARCH)/kernel/Makefile + +export AFLAGS_vmlinux.lds.o += -P -C -U$(ARCH) + +# Single targets +# --------------------------------------------------------------------------- + +%.s: %.c scripts FORCE + $(Q)$(MAKE) $(build)=$(@D) $@ +%.i: %.c scripts FORCE + $(Q)$(MAKE) $(build)=$(@D) $@ +%.o: %.c scripts FORCE + $(Q)$(MAKE) $(build)=$(@D) $@ +%/: scripts prepare FORCE + $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) $(build)=$(@D) +%.lst: %.c scripts FORCE + $(Q)$(MAKE) $(build)=$(@D) $@ +%.s: %.S scripts FORCE + $(Q)$(MAKE) $(build)=$(@D) $@ +%.o: %.S scripts FORCE + $(Q)$(MAKE) $(build)=$(@D) $@ + +# FIXME: The asm symlink changes when $(ARCH) changes. That's +# hard to detect, but I suppose "make mrproper" is a good idea +# before switching between archs anyway. + +include/asm: + @echo ' SYMLINK $@ -> include/asm-$(ARCH)' + $(Q)if [ ! -d include ]; then mkdir -p include; fi; + @ln -fsn asm-$(ARCH) $@ + +# Split autoconf.h into include/linux/config/* + +include/config/MARKER: scripts/split-include include/linux/autoconf.h + @echo ' SPLIT include/linux/autoconf.h -> include/config/*' + @scripts/split-include include/linux/autoconf.h include/config + @touch $@ + +# Generate some files +# --------------------------------------------------------------------------- + +# version.h changes when $(KERNELRELEASE) etc change, as defined in +# this Makefile + +uts_len := 64 + +define filechk_version.h + if expr length "$(KERNELRELEASE)" \> $(uts_len) >/dev/null ; then \ + echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2; \ + exit 1; \ + fi; \ + (echo \#define UTS_RELEASE \"$(KERNELRELEASE)\"; \ + echo \#define LINUX_VERSION_CODE `expr $(VERSION) \\* 65536 + $(PATCHLEVEL) \\* 256 + $(SUBLEVEL)`; \ + echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))'; \ + ) +endef + +include/linux/version.h: Makefile + $(call filechk,version.h) + +# --------------------------------------------------------------------------- + +.PHONY: depend dep +depend dep: + @echo '*** Warning: make $@ is unnecessary now.' 
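Since filechk_version.h above generates the KERNEL_VERSION() macro, the arithmetic is easy to sanity-check in isolation: the version triplet packs into a single comparable integer, eight bits per component below the major number. A standalone check for the 2.6.0 tree this patch builds (plain user-space C, nothing kernel-specific):

    #include <stdio.h>

    /* Same packing as the generated include/linux/version.h. */
    #define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))

    int main(void)
    {
        /* 2*65536 + 6*256 + 0 = 132608 = 0x020600 */
        printf("%d 0x%06x\n", KERNEL_VERSION(2, 6, 0),
               KERNEL_VERSION(2, 6, 0));
        return 0;
    }

This is also why EXTRAVERSION (-test9 above) never enters LINUX_VERSION_CODE: only the three numeric components are encoded.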
+ +# --------------------------------------------------------------------------- +# Modules + +ifdef CONFIG_MODULES + +# By default, build modules as well + +all: modules + +# Build modules + +.PHONY: modules +modules: $(SUBDIRS) $(if $(KBUILD_BUILTIN),vmlinux) + @echo ' Building modules, stage 2.'; + $(Q)$(MAKE) -rR -f $(srctree)/scripts/Makefile.modpost + +# Install modules + +.PHONY: modules_install +modules_install: _modinst_ _modinst_post + +.PHONY: _modinst_ +_modinst_: + @if [ -z "`$(DEPMOD) -V | grep module-init-tools`" ]; then \ + echo "Warning: you may need to install module-init-tools"; \ + echo "See http://www.codemonkey.org.uk/post-halloween-2.5.txt";\ + sleep 1; \ + fi + @rm -rf $(MODLIB)/kernel + @rm -f $(MODLIB)/build + @mkdir -p $(MODLIB)/kernel + @ln -s $(TOPDIR) $(MODLIB)/build + $(Q)$(MAKE) -rR -f $(srctree)/scripts/Makefile.modinst + +# If System.map exists, run depmod. This deliberately does not have a +# dependency on System.map since that would run the dependency tree on +# vmlinux. This depmod is only for convenience to give the initial +# boot a modules.dep even before / is mounted read-write. However the +# boot script depmod is the master version. +ifeq "$(strip $(INSTALL_MOD_PATH))" "" +depmod_opts := +else +depmod_opts := -b $(INSTALL_MOD_PATH) -r +endif +.PHONY: _modinst_post +_modinst_post: _modinst_ + if [ -r System.map ]; then $(DEPMOD) -ae -F System.map $(depmod_opts) $(KERNELRELEASE); fi + +else # CONFIG_MODULES + +# Modules not configured +# --------------------------------------------------------------------------- + +modules modules_install: FORCE + @echo + @echo "The present kernel configuration has modules disabled." + @echo "Type 'make config' and enable loadable module support." + @echo "Then build a kernel with module support enabled." + @echo + @exit 1 + +endif # CONFIG_MODULES + +# Generate asm-offsets.h +# --------------------------------------------------------------------------- + +define filechk_gen-asm-offsets + (set -e; \ + echo "#ifndef __ASM_OFFSETS_H__"; \ + echo "#define __ASM_OFFSETS_H__"; \ + echo "/*"; \ + echo " * DO NOT MODIFY."; \ + echo " *"; \ + echo " * This file was generated by arch/$(ARCH)/Makefile"; \ + echo " *"; \ + echo " */"; \ + echo ""; \ + sed -ne "/^->/{s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; s:->::; p;}"; \ + echo ""; \ + echo "#endif" ) +endef + + +### +# Cleaning is done on three levels. +# make clean Delete all automatically generated files, including +# tools and firmware. 
+# make mrproper Delete the current configuration, and related files +# Any core files spread around are deleted as well +# make distclean Remove editor backup files, patch leftover files and the like + +# Files removed with 'make clean' +CLEAN_FILES += vmlinux System.map MC* + +# Files removed with 'make mrproper' +MRPROPER_FILES += \ + include/linux/autoconf.h include/linux/version.h \ + .version .config .config.old config.in config.old \ + .menuconfig.log \ + include/asm \ + .hdepend include/linux/modversions.h \ + tags TAGS cscope* kernel.spec \ + .tmp* + +# Directories removed with 'make mrproper' +MRPROPER_DIRS += \ + $(MODVERDIR) \ + .tmp_export-objs \ + include/config \ + include/linux/modules \ + include2 + +# clean - Delete all intermediate files +# +clean-dirs += $(addprefix _clean_,$(ALL_SUBDIRS) Documentation/DocBook scripts) +.PHONY: $(clean-dirs) clean archclean mrproper archmrproper distclean +$(clean-dirs): + $(Q)$(MAKE) $(clean)=$(patsubst _clean_%,%,$@) + +quiet_cmd_rmclean = RM $$(CLEAN_FILES) +cmd_rmclean = rm -f $(CLEAN_FILES) +clean: archclean $(clean-dirs) + $(call cmd,rmclean) + @find . $(RCS_FIND_IGNORE) \ + \( -name '*.[oas]' -o -name '*.ko' -o -name '.*.cmd' \ + -o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \) \ + -type f -print | xargs rm -f + +# mrproper - delete configuration + modules + core files +# +quiet_cmd_mrproper = RM $$(MRPROPER_DIRS) + $$(MRPROPER_FILES) +cmd_mrproper = rm -rf $(MRPROPER_DIRS) && rm -f $(MRPROPER_FILES) +mrproper distclean: clean archmrproper + @echo ' Making $@ in the srctree' + @find . $(RCS_FIND_IGNORE) \ + \( -name '*.orig' -o -name '*.rej' -o -name '*~' \ + -o -name '*.bak' -o -name '#*#' -o -name '.*.orig' \ + -o -name '.*.rej' -o -size 0 \ + -o -name '*%' -o -name '.*.cmd' -o -name 'core' \) \ + -type f -print | xargs rm -f + $(call cmd,mrproper) + +# Generate tags for editors +# --------------------------------------------------------------------------- + +define all-sources + ( find . $(RCS_FIND_IGNORE) \ + \( -name include -o -name arch \) -prune -o \ + -name '*.[chS]' -print; \ + find arch/$(ARCH) $(RCS_FIND_IGNORE) \ + -name '*.[chS]' -print; \ + find include $(RCS_FIND_IGNORE) \ + \( -name config -o -name 'asm-*' \) -prune \ + -o -name '*.[chS]' -print; \ + find include/asm-$(ARCH) $(RCS_FIND_IGNORE) \ + -name '*.[chS]' -print; \ + find include/asm-generic $(RCS_FIND_IGNORE) \ + -name '*.[chS]' -print ) +endef + +quiet_cmd_cscope = MAKE $@ +cmd_cscope = $(all-sources) | cscope -k -b -i - + +quiet_cmd_TAGS = MAKE $@ +cmd_TAGS = $(all-sources) | etags - + +# Exuberant ctags works better with -I + +quiet_cmd_tags = MAKE $@ +define cmd_tags + rm -f $@; \ + CTAGSF=`ctags --version | grep -i exuberant >/dev/null && echo "-I __initdata,__exitdata,EXPORT_SYMBOL,EXPORT_SYMBOL_NOVERS"`; \ + $(all-sources) | xargs ctags $$CTAGSF -a +endef + +cscope: FORCE + $(call cmd,cscope) + +TAGS: FORCE + $(call cmd,TAGS) + +tags: FORCE + $(call cmd,tags) + +# RPM target +# --------------------------------------------------------------------------- + +.PHONY: rpm + +# Remove hyphens since they have special meaning in RPM filenames +KERNELPATH=kernel-$(subst -,,$(KERNELRELEASE)) + +# If you do a make spec before packing the tarball you can rpm -ta it + +spec: + $(CONFIG_SHELL) $(srctree)/scripts/mkspec > $(objtree)/kernel.spec + +# a) Build a tar ball +# b) generate an rpm from it +# c) and pack the result +# - Use /. to avoid tar packing just the symlink + +rpm: clean spec + set -e; \ + cd .. 
; \ + ln -sf $(srctree) $(KERNELPATH) ; \ + tar -cvz $(RCS_TAR_IGNORE) -f $(KERNELPATH).tar.gz $(KERNELPATH)/. ; \ + rm $(KERNELPATH) + + set -e; \ + $(CONFIG_SHELL) $(srctree)/scripts/mkversion > $(objtree)/.tmp_version;\ + mv -f $(objtree)/.tmp_version $(objtree)/.version; + + $(RPM) -ta ../$(KERNELPATH).tar.gz + rm ../$(KERNELPATH).tar.gz + +# Brief documentation of the typical targets used +# --------------------------------------------------------------------------- + +help: + @echo 'Cleaning targets:' + @echo ' clean - remove most generated files but keep the config' + @echo ' mrproper - remove all generated files + config + various backup files' + @echo '' + @echo 'Configuration targets:' + @$(MAKE) -f $(srctree)/scripts/kconfig/Makefile help + @echo '' + @echo 'Other generic targets:' + @echo ' all - Build all targets marked with [*]' + @echo '* vmlinux - Build the bare kernel' + @echo '* modules - Build all modules' + @echo ' modules_install - Install all modules' + @echo ' dir/ - Build all files in dir and below' + @echo ' dir/file.[ois] - Build specified target only' + @echo ' rpm - Build a kernel as an RPM package' + @echo ' tags/TAGS - Generate tags file for editors' + @echo '' + @echo 'Documentation targets:' + @$(MAKE) -f $(srctree)/Documentation/DocBook/Makefile dochelp + @echo '' + @echo 'Architecture specific targets ($(ARCH)):' + @$(if $(archhelp),$(archhelp),\ + echo ' No architecture specific help defined for $(ARCH)') + @echo '' + @echo ' make V=0|1 [targets] 0 => quiet build (default), 1 => verbose build' + @echo ' make O=dir [targets] Locate all output files in "dir", including .config' + @echo ' make C=1 [targets] Check all c source with checker tool' + @echo '' + @echo 'Execute "make" or "make all" to build all targets marked with [*] ' + @echo 'For further info see the ./README file' + + +# Documentation targets +# --------------------------------------------------------------------------- +%docs: scripts/docproc FORCE + $(Q)$(MAKE) $(build)=Documentation/DocBook $@ + +# Scripts to check various things for consistency +# --------------------------------------------------------------------------- + +configcheck: + find * $(RCS_FIND_IGNORE) \ + -name '*.[hcS]' -type f -print | sort \ + | xargs $(PERL) -w scripts/checkconfig.pl + +includecheck: + find * $(RCS_FIND_IGNORE) \ + -name '*.[hcS]' -type f -print | sort \ + | xargs $(PERL) -w scripts/checkincludes.pl + +versioncheck: + find * $(RCS_FIND_IGNORE) \ + -name '*.[hcS]' -type f -print | sort \ + | xargs $(PERL) -w scripts/checkversion.pl + +endif #ifeq ($(config-targets),1) +endif #ifeq ($(mixed-targets),1) + +# FIXME Should go into a make.lib or something +# =========================================================================== + +a_flags = -Wp,-MD,$(depfile) $(AFLAGS) $(AFLAGS_KERNEL) \ + $(NOSTDINC_FLAGS) $(CPPFLAGS) \ + $(modkern_aflags) $(EXTRA_AFLAGS) $(AFLAGS_$(*F).o) + +quiet_cmd_as_o_S = AS $@ +cmd_as_o_S = $(CC) $(a_flags) -c -o $@ $< + +# read all saved command lines + +targets := $(wildcard $(sort $(targets))) +cmd_files := $(wildcard .*.cmd $(foreach f,$(targets),$(dir $(f)).$(notdir $(f)).cmd)) + +ifneq ($(cmd_files),) + $(cmd_files): ; # Do not try to update included dependency files + include $(cmd_files) +endif + +# execute the command and also postprocess generated .d dependencies +# file + +if_changed_dep = $(if $(strip $? 
$(filter-out FORCE $(wildcard $^),$^)\ + $(filter-out $(cmd_$(1)),$(cmd_$@))\ + $(filter-out $(cmd_$@),$(cmd_$(1)))),\ + @set -e; \ + $(if $($(quiet)cmd_$(1)),echo ' $(subst ','\'',$($(quiet)cmd_$(1)))';) \ + $(cmd_$(1)); \ + scripts/fixdep $(depfile) $@ '$(subst $$,$$$$,$(subst ','\'',$(cmd_$(1))))' > $(@D)/.$(@F).tmp; \ + rm -f $(depfile); \ + mv -f $(@D)/.$(@F).tmp $(@D)/.$(@F).cmd) + +# Usage: $(call if_changed_rule,foo) +# will check if $(cmd_foo) changed, or any of the prequisites changed, +# and if so will execute $(rule_foo) + +if_changed_rule = $(if $(strip $? \ + $(filter-out $(cmd_$(1)),$(cmd_$(@F)))\ + $(filter-out $(cmd_$(@F)),$(cmd_$(1)))),\ + @$(rule_$(1))) + +# If quiet is set, only print short version of command + +cmd = @$(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))' &&) $(cmd_$(1)) + +# filechk is used to check if the content of a generated file is updated. +# Sample usage: +# define filechk_sample +# echo $KERNELRELEASE +# endef +# version.h : Makefile +# $(call filechk,sample) +# The rule defined shall write to stdout the content of the new file. +# The existing file will be compared with the new one. +# - If no file exist it is created +# - If the content differ the new file is used +# - If they are equal no change, and no timestamp update + +define filechk + @set -e; \ + echo ' CHK $@'; \ + mkdir -p $(dir $@); \ + $(filechk_$(1)) < $< > $@.tmp; \ + if [ -r $@ ] && cmp -s $@ $@.tmp; then \ + rm -f $@.tmp; \ + else \ + echo ' UPD $@'; \ + mv -f $@.tmp $@; \ + fi +endef + +# Shorthand for $(Q)$(MAKE) -f scripts/Makefile.build obj=dir +# Usage: +# $(Q)$(MAKE) $(build)=dir +build := -f $(if $(KBUILD_SRC),$(srctree)/)scripts/Makefile.build obj + +# Shorthand for $(Q)$(MAKE) -f scripts/Makefile.clean obj=dir +# Usage: +# $(Q)$(MAKE) $(clean)=dir +clean := -f $(if $(KBUILD_SRC),$(srctree)/)scripts/Makefile.clean obj + +# $(call descend,,) +# Recursively call a sub-make in with target +# Usage is deprecated, because make does not see this as an invocation of make. +descend =$(Q)$(MAKE) -f $(if $(KBUILD_SRC),$(srctree)/)scripts/Makefile.build obj=$(1) $(2) + +endif # skip-makefile + +FORCE: diff -Nru a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c --- a/arch/alpha/kernel/setup.c Fri Oct 31 14:10:53 2003 +++ b/arch/alpha/kernel/setup.c Fri Oct 31 14:10:53 2003 @@ -486,6 +486,21 @@ hwrpb = (struct hwrpb_struct*) __va(INIT_HWRPB->phys_addr); boot_cpuid = hard_smp_processor_id(); + /* + * Pre-process the system type to make sure it will be valid. + * + * This may restore real CABRIO and EB66+ family names, ie + * EB64+ and EB66. + * + * Oh, and "white box" AS800 (aka DIGITAL Server 3000 series) + * and AS1200 (DIGITAL Server 5000 series) have the type as + * the negative of the real one. + */ + if ((long)hwrpb->sys_type < 0) { + hwrpb->sys_type = -((long)hwrpb->sys_type); + hwrpb_update_checksum(hwrpb); + } + /* Register a call for panic conditions. 
*/ notifier_chain_register(&panic_notifier_list, &alpha_panic_block); diff -Nru a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c --- a/arch/h8300/kernel/time.c Fri Oct 31 14:10:53 2003 +++ b/arch/h8300/kernel/time.c Fri Oct 31 14:10:53 2003 @@ -143,3 +143,9 @@ } EXPORT_SYMBOL(do_settimeofday); + +unsigned long long sched_clock(void) +{ + return (unsigned long long)jiffies * (1000000000 / HZ); + +} diff -Nru a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S --- a/arch/i386/kernel/entry.S Fri Oct 31 14:10:53 2003 +++ b/arch/i386/kernel/entry.S Fri Oct 31 14:10:53 2003 @@ -880,5 +880,10 @@ .long sys_utimes .long sys_fadvise64_64 .long sys_ni_syscall /* sys_vserver */ +#ifdef CONFIG_REISER4_FS + .long sys_reiser4 +#else + .long sys_ni_syscall +#endif nr_syscalls=(.-sys_call_table)/4 diff -Nru a/arch/i386/kernel/entry.S~i386-sys_reiser4.diff b/arch/i386/kernel/entry.S~i386-sys_reiser4.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/i386/kernel/entry.S~i386-sys_reiser4.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,884 @@ +/* + * linux/arch/i386/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'ret_from_system_call': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - orig_eax + * 28(%esp) - %eip + * 2C(%esp) - %cs + * 30(%esp) - %eflags + * 34(%esp) - %oldesp + * 38(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "irq_vectors.h" + +EBX = 0x00 +ECX = 0x04 +EDX = 0x08 +ESI = 0x0C +EDI = 0x10 +EBP = 0x14 +EAX = 0x18 +DS = 0x1C +ES = 0x20 +ORIG_EAX = 0x24 +EIP = 0x28 +CS = 0x2C +EFLAGS = 0x30 +OLDESP = 0x34 +OLDSS = 0x38 + +CF_MASK = 0x00000001 +TF_MASK = 0x00000100 +IF_MASK = 0x00000200 +DF_MASK = 0x00000400 +NT_MASK = 0x00004000 +VM_MASK = 0x00020000 + +/* + * ESP0 is at offset 4. 
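Returning to the arch/h8300/kernel/time.c hunk above: the new sched_clock() rescales jiffies to nanoseconds, and because 1000000000 / HZ is evaluated at compile time, the result stays exact whenever HZ divides 10^9. A user-space rendering of the same arithmetic (HZ=100 is an assumption for the example, not necessarily the h8300 tick rate):

    #include <stdio.h>

    #define HZ 100  /* assumed tick rate for the demo */

    /* Mirror of the new h8300 sched_clock(). */
    static unsigned long long demo_sched_clock(unsigned long jiffies)
    {
        return (unsigned long long)jiffies * (1000000000 / HZ);
    }

    int main(void)
    {
        /* 250 ticks at 100 Hz = 2.5 s = 2500000000 ns */
        printf("%llu\n", demo_sched_clock(250));
        return 0;
    }

The cast to unsigned long long before the multiply is the important detail: without it the product would wrap in 32 bits after roughly 4.3 seconds' worth of jiffies.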
0x200 is the size of the TSS, and + * also thus the top-of-stack pointer offset of SYSENTER_ESP + */ +TSS_ESP0_OFFSET = (4 - 0x200) + +#ifdef CONFIG_PREEMPT +#define preempt_stop cli +#else +#define preempt_stop +#define resume_kernel restore_all +#endif + +#define SAVE_ALL \ + cld; \ + pushl %es; \ + pushl %ds; \ + pushl %eax; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; \ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; + +#define RESTORE_INT_REGS \ + popl %ebx; \ + popl %ecx; \ + popl %edx; \ + popl %esi; \ + popl %edi; \ + popl %ebp; \ + popl %eax + +#define RESTORE_REGS \ + RESTORE_INT_REGS; \ +1: popl %ds; \ +2: popl %es; \ +.section .fixup,"ax"; \ +3: movl $0,(%esp); \ + jmp 1b; \ +4: movl $0,(%esp); \ + jmp 2b; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,3b; \ + .long 2b,4b; \ +.previous + + +#define RESTORE_ALL \ + RESTORE_REGS \ + addl $4, %esp; \ +1: iret; \ +.section .fixup,"ax"; \ +2: sti; \ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; \ + pushl $11; \ + call do_exit; \ +.previous; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,2b; \ +.previous + + + +ENTRY(lcall7) + pushfl # We get a different stack layout with call + # gates, which has to be cleaned up later.. + pushl %eax + SAVE_ALL + movl %esp, %ebp + pushl %ebp + pushl $0x7 +do_lcall: + movl EIP(%ebp), %eax # due to call gates, this is eflags, not eip.. + movl CS(%ebp), %edx # this is eip.. + movl EFLAGS(%ebp), %ecx # and this is cs.. + movl %eax,EFLAGS(%ebp) # + movl %edx,EIP(%ebp) # Now we move them to their "normal" places + movl %ecx,CS(%ebp) # + andl $-8192, %ebp # GET_THREAD_INFO + movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain + call *4(%edx) # Call the lcall7 handler for the domain + addl $4, %esp + popl %eax + jmp resume_userspace + +ENTRY(lcall27) + pushfl # We get a different stack layout with call + # gates, which has to be cleaned up later.. + pushl %eax + SAVE_ALL + movl %esp, %ebp + pushl %ebp + pushl $0x27 + jmp do_lcall + + +ENTRY(ret_from_fork) + pushl %eax + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + jmp syscall_exit + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN +ret_from_exception: + preempt_stop +ret_from_intr: + GET_THREAD_INFO(%ebp) + movl EFLAGS(%esp), %eax # mix EFLAGS and CS + movb CS(%esp), %al + testl $(VM_MASK | 3), %eax + jz resume_kernel # returning to kernel or vm86-space +ENTRY(resume_userspace) + cli # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_FLAGS(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + cmpl $0,TI_PRE_COUNT(%ebp) # non-zero preempt_count ? + jnz restore_all +need_resched: + movl TI_FLAGS(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl + jz restore_all + testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) + sti + call schedule + movl $0,TI_PRE_COUNT(%ebp) + cli + jmp need_resched +#endif + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. 
See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(sysenter_entry) + movl TSS_ESP0_OFFSET(%esp),%esp +sysenter_past_esp: + sti + pushl $(__USER_DS) + pushl %ebp + pushfl + pushl $(__USER_CS) + pushl $SYSENTER_RETURN + +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + + pushl %eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + cmpl $(nr_syscalls), %eax + jae syscall_badsys + + testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp) + jnz syscall_trace_entry + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) + cli + movl TI_FLAGS(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx + jne syscall_exit_work +/* if something modifies registers it must also disable sysexit */ + movl EIP(%esp), %edx + movl OLDESP(%esp), %ecx + sti + sysexit + + + # system call handler stub +ENTRY(system_call) + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + cmpl $(nr_syscalls), %eax + jae syscall_badsys + # system call tracing in operation + testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp) + jnz syscall_trace_entry +syscall_call: + call *sys_call_table(,%eax,4) + movl %eax,EAX(%esp) # store the return value +syscall_exit: + cli # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_FLAGS(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work +restore_all: + RESTORE_ALL + + # perform work that needs to be done immediately before resumption + ALIGN +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + cli # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_FLAGS(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests + testl $VM_MASK, EFLAGS(%esp) + movl %esp, %eax + jne work_notifysig_v86 # returning to kernel-space or + # vm86-space + xorl %edx, %edx + call do_notify_resume + jmp restore_all + + ALIGN +work_notifysig_v86: + pushl %ecx + call save_v86_state + popl %ecx + movl %eax, %esp + xorl %edx, %edx + call do_notify_resume + jmp restore_all + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,EAX(%esp) + movl %esp, %eax + xorl %edx,%edx + call do_syscall_trace + movl ORIG_EAX(%esp), %eax + cmpl $(nr_syscalls), %eax + jnae syscall_call + jmp syscall_exit + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testb $_TIF_SYSCALL_TRACE, %cl + jz work_pending + sti # could let do_syscall_trace() call + # schedule() instead + movl %esp, %eax + movl $1, %edx + call do_syscall_trace + jmp resume_userspace + + ALIGN +syscall_fault: + pushl %eax # save orig_eax + SAVE_ALL + GET_THREAD_INFO(%ebp) + movl $-EFAULT,EAX(%esp) + jmp resume_userspace + + ALIGN +syscall_badsys: + movl $-ENOSYS,EAX(%esp) + jmp resume_userspace + +/* + * Build the entry stubs and pointer table with + * some assembler magic. 
+ */ +.data +ENTRY(interrupt) +.text + +vector=0 +ENTRY(irq_entries_start) +.rept NR_IRQS + ALIGN +1: pushl $vector-256 + jmp common_interrupt +.data + .long 1b +.text +vector=vector+1 +.endr + + ALIGN +common_interrupt: + SAVE_ALL + call do_IRQ + jmp ret_from_intr + +#define BUILD_INTERRUPT(name, nr) \ +ENTRY(name) \ + pushl $nr-256; \ + SAVE_ALL \ + call smp_/**/name; \ + jmp ret_from_intr; + +/* The include is where all of the SMP etc. interrupts come from */ +#include "entry_arch.h" + +ENTRY(divide_error) + pushl $0 # no error code + pushl $do_divide_error + ALIGN +error_code: + pushl %ds + pushl %eax + xorl %eax, %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + decl %eax # eax = -1 + pushl %ecx + pushl %ebx + cld + movl %es, %ecx + movl ORIG_EAX(%esp), %esi # get the error code + movl ES(%esp), %edi # get the function address + movl %eax, ORIG_EAX(%esp) + movl %ecx, ES(%esp) + movl %esp, %edx + pushl %esi # push the error code + pushl %edx # push the pt_regs pointer + movl $(__USER_DS), %edx + movl %edx, %ds + movl %edx, %es + call *%edi + addl $8, %esp + jmp ret_from_exception + +ENTRY(coprocessor_error) + pushl $0 + pushl $do_coprocessor_error + jmp error_code + +ENTRY(simd_coprocessor_error) + pushl $0 + pushl $do_simd_coprocessor_error + jmp error_code + +ENTRY(device_not_available) + pushl $-1 # mark this as an int + SAVE_ALL + movl %cr0, %eax + testl $0x4, %eax # EM (math emulation bit) + jne device_not_available_emulate + preempt_stop + call math_state_restore + jmp ret_from_exception +device_not_available_emulate: + pushl $0 # temporary storage for ORIG_EIP + call math_emulate + addl $4, %esp + jmp ret_from_exception + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_ESP0_OFFSET+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl TSS_ESP0_OFFSET+offset(%esp),%esp; \ + pushfl; \ + pushl $__KERNEL_CS; \ + pushl $sysenter_past_esp + +ENTRY(debug) + cmpl $sysenter_entry,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: + pushl $0 + pushl $do_debug + jmp error_code + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +ENTRY(nmi) + cmpl $sysenter_entry,(%esp) + je nmi_stack_fixup + pushl %eax + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. 
+ */ + andl $0x1fff,%eax + cmpl $0x1fec,%eax + popl %eax + jae nmi_stack_correct + cmpl $sysenter_entry,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + pushl %eax + SAVE_ALL + movl %esp, %edx + pushl $0 + pushl %edx + call do_nmi + addl $8, %esp + RESTORE_ALL + +nmi_stack_fixup: + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct +nmi_debug_stack_check: + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug - 1,(%esp) + jle nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + jle nmi_debug_stack_fixup +nmi_debug_stack_fixup: + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +ENTRY(int3) + pushl $0 + pushl $do_int3 + jmp error_code + +ENTRY(overflow) + pushl $0 + pushl $do_overflow + jmp error_code + +ENTRY(bounds) + pushl $0 + pushl $do_bounds + jmp error_code + +ENTRY(invalid_op) + pushl $0 + pushl $do_invalid_op + jmp error_code + +ENTRY(coprocessor_segment_overrun) + pushl $0 + pushl $do_coprocessor_segment_overrun + jmp error_code + +ENTRY(invalid_TSS) + pushl $do_invalid_TSS + jmp error_code + +ENTRY(segment_not_present) + pushl $do_segment_not_present + jmp error_code + +ENTRY(stack_segment) + pushl $do_stack_segment + jmp error_code + +ENTRY(general_protection) + pushl $do_general_protection + jmp error_code + +ENTRY(alignment_check) + pushl $do_alignment_check + jmp error_code + +ENTRY(page_fault) + pushl $do_page_fault + jmp error_code + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + pushl $0 + pushl machine_check_vector + jmp error_code +#endif + +ENTRY(spurious_interrupt_bug) + pushl $0 + pushl $do_spurious_interrupt_bug + jmp error_code + +.data +ENTRY(sys_call_table) + .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ + .long sys_exit + .long sys_fork + .long sys_read + .long sys_write + .long sys_open /* 5 */ + .long sys_close + .long sys_waitpid + .long sys_creat + .long sys_link + .long sys_unlink /* 10 */ + .long sys_execve + .long sys_chdir + .long sys_time + .long sys_mknod + .long sys_chmod /* 15 */ + .long sys_lchown16 + .long sys_ni_syscall /* old break syscall holder */ + .long sys_stat + .long sys_lseek + .long sys_getpid /* 20 */ + .long sys_mount + .long sys_oldumount + .long sys_setuid16 + .long sys_getuid16 + .long sys_stime /* 25 */ + .long sys_ptrace + .long sys_alarm + .long sys_fstat + .long sys_pause + .long sys_utime /* 30 */ + .long sys_ni_syscall /* old stty syscall holder */ + .long sys_ni_syscall /* old gtty syscall holder */ + .long sys_access + .long sys_nice + .long sys_ni_syscall /* 35 - old ftime syscall holder */ + .long sys_sync + .long sys_kill + .long sys_rename + .long sys_mkdir + .long sys_rmdir /* 40 */ + .long sys_dup + .long sys_pipe + .long sys_times + .long sys_ni_syscall /* old prof syscall holder */ + .long sys_brk /* 45 */ + .long sys_setgid16 + .long sys_getgid16 + .long sys_signal + .long sys_geteuid16 + .long sys_getegid16 /* 50 */ + .long sys_acct + .long sys_umount /* recycled never used phys() */ + .long sys_ni_syscall /* old lock syscall holder */ + .long sys_ioctl + .long sys_fcntl /* 55 */ + .long sys_ni_syscall /* old mpx syscall holder */ + .long sys_setpgid + .long sys_ni_syscall /* old ulimit syscall holder */ + .long sys_olduname + .long sys_umask /* 60 */ + .long sys_chroot + .long sys_ustat + .long sys_dup2 + .long sys_getppid + .long sys_getpgrp /* 65 */ + .long sys_setsid + .long sys_sigaction + .long sys_sgetmask + .long sys_ssetmask + .long sys_setreuid16 /* 70 */ + .long sys_setregid16 + .long sys_sigsuspend + .long sys_sigpending + 
.long sys_sethostname + .long sys_setrlimit /* 75 */ + .long sys_old_getrlimit + .long sys_getrusage + .long sys_gettimeofday + .long sys_settimeofday + .long sys_getgroups16 /* 80 */ + .long sys_setgroups16 + .long old_select + .long sys_symlink + .long sys_lstat + .long sys_readlink /* 85 */ + .long sys_uselib + .long sys_swapon + .long sys_reboot + .long old_readdir + .long old_mmap /* 90 */ + .long sys_munmap + .long sys_truncate + .long sys_ftruncate + .long sys_fchmod + .long sys_fchown16 /* 95 */ + .long sys_getpriority + .long sys_setpriority + .long sys_ni_syscall /* old profil syscall holder */ + .long sys_statfs + .long sys_fstatfs /* 100 */ + .long sys_ioperm + .long sys_socketcall + .long sys_syslog + .long sys_setitimer + .long sys_getitimer /* 105 */ + .long sys_newstat + .long sys_newlstat + .long sys_newfstat + .long sys_uname + .long sys_iopl /* 110 */ + .long sys_vhangup + .long sys_ni_syscall /* old "idle" system call */ + .long sys_vm86old + .long sys_wait4 + .long sys_swapoff /* 115 */ + .long sys_sysinfo + .long sys_ipc + .long sys_fsync + .long sys_sigreturn + .long sys_clone /* 120 */ + .long sys_setdomainname + .long sys_newuname + .long sys_modify_ldt + .long sys_adjtimex + .long sys_mprotect /* 125 */ + .long sys_sigprocmask + .long sys_ni_syscall /* old "create_module" */ + .long sys_init_module + .long sys_delete_module + .long sys_ni_syscall /* 130: old "get_kernel_syms" */ + .long sys_quotactl + .long sys_getpgid + .long sys_fchdir + .long sys_bdflush + .long sys_sysfs /* 135 */ + .long sys_personality + .long sys_ni_syscall /* reserved for afs_syscall */ + .long sys_setfsuid16 + .long sys_setfsgid16 + .long sys_llseek /* 140 */ + .long sys_getdents + .long sys_select + .long sys_flock + .long sys_msync + .long sys_readv /* 145 */ + .long sys_writev + .long sys_getsid + .long sys_fdatasync + .long sys_sysctl + .long sys_mlock /* 150 */ + .long sys_munlock + .long sys_mlockall + .long sys_munlockall + .long sys_sched_setparam + .long sys_sched_getparam /* 155 */ + .long sys_sched_setscheduler + .long sys_sched_getscheduler + .long sys_sched_yield + .long sys_sched_get_priority_max + .long sys_sched_get_priority_min /* 160 */ + .long sys_sched_rr_get_interval + .long sys_nanosleep + .long sys_mremap + .long sys_setresuid16 + .long sys_getresuid16 /* 165 */ + .long sys_vm86 + .long sys_ni_syscall /* Old sys_query_module */ + .long sys_poll + .long sys_nfsservctl + .long sys_setresgid16 /* 170 */ + .long sys_getresgid16 + .long sys_prctl + .long sys_rt_sigreturn + .long sys_rt_sigaction + .long sys_rt_sigprocmask /* 175 */ + .long sys_rt_sigpending + .long sys_rt_sigtimedwait + .long sys_rt_sigqueueinfo + .long sys_rt_sigsuspend + .long sys_pread64 /* 180 */ + .long sys_pwrite64 + .long sys_chown16 + .long sys_getcwd + .long sys_capget + .long sys_capset /* 185 */ + .long sys_sigaltstack + .long sys_sendfile + .long sys_ni_syscall /* reserved for streams1 */ + .long sys_ni_syscall /* reserved for streams2 */ + .long sys_vfork /* 190 */ + .long sys_getrlimit + .long sys_mmap2 + .long sys_truncate64 + .long sys_ftruncate64 + .long sys_stat64 /* 195 */ + .long sys_lstat64 + .long sys_fstat64 + .long sys_lchown + .long sys_getuid + .long sys_getgid /* 200 */ + .long sys_geteuid + .long sys_getegid + .long sys_setreuid + .long sys_setregid + .long sys_getgroups /* 205 */ + .long sys_setgroups + .long sys_fchown + .long sys_setresuid + .long sys_getresuid + .long sys_setresgid /* 210 */ + .long sys_getresgid + .long sys_chown + .long sys_setuid + .long sys_setgid + 
.long sys_setfsuid /* 215 */ + .long sys_setfsgid + .long sys_pivot_root + .long sys_mincore + .long sys_madvise + .long sys_getdents64 /* 220 */ + .long sys_fcntl64 + .long sys_ni_syscall /* reserved for TUX */ + .long sys_ni_syscall + .long sys_gettid + .long sys_readahead /* 225 */ + .long sys_setxattr + .long sys_lsetxattr + .long sys_fsetxattr + .long sys_getxattr + .long sys_lgetxattr /* 230 */ + .long sys_fgetxattr + .long sys_listxattr + .long sys_llistxattr + .long sys_flistxattr + .long sys_removexattr /* 235 */ + .long sys_lremovexattr + .long sys_fremovexattr + .long sys_tkill + .long sys_sendfile64 + .long sys_futex /* 240 */ + .long sys_sched_setaffinity + .long sys_sched_getaffinity + .long sys_set_thread_area + .long sys_get_thread_area + .long sys_io_setup /* 245 */ + .long sys_io_destroy + .long sys_io_getevents + .long sys_io_submit + .long sys_io_cancel + .long sys_fadvise64 /* 250 */ + .long sys_ni_syscall + .long sys_exit_group + .long sys_lookup_dcookie + .long sys_epoll_create + .long sys_epoll_ctl /* 255 */ + .long sys_epoll_wait + .long sys_remap_file_pages + .long sys_set_tid_address + .long sys_timer_create + .long sys_timer_settime /* 260 */ + .long sys_timer_gettime + .long sys_timer_getoverrun + .long sys_timer_delete + .long sys_clock_settime + .long sys_clock_gettime /* 265 */ + .long sys_clock_getres + .long sys_clock_nanosleep + .long sys_statfs64 + .long sys_fstatfs64 + .long sys_tgkill /* 270 */ + .long sys_utimes + .long sys_fadvise64_64 + .long sys_ni_syscall /* sys_vserver */ + +nr_syscalls=(.-sys_call_table)/4 diff -Nru a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c --- a/arch/i386/kernel/sys_i386.c Fri Oct 31 14:10:54 2003 +++ b/arch/i386/kernel/sys_i386.c Fri Oct 31 14:10:54 2003 @@ -56,7 +56,7 @@ } down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + error = do_mmap_pgoff(current->mm, file, addr, len, prot, flags, pgoff); up_write(¤t->mm->mmap_sem); if (file) diff -Nru a/arch/i386/kernel/sys_i386.c~do_mmap2-fix.diff b/arch/i386/kernel/sys_i386.c~do_mmap2-fix.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/i386/kernel/sys_i386.c~do_mmap2-fix.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,253 @@ +/* + * linux/arch/i386/kernel/sys_i386.c + * + * This file contains various random system calls that + * have a non-standard calling sequence on the Linux/i386 + * platform. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * sys_pipe() is the normal C calling standard for creating + * a pipe. It's not the way Unix traditionally does this, though. 
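The live sys_i386.c hunk above (the ~do_mmap2-fix backup that follows shows the pre-change file) is the caller side of a do_mmap_pgoff() signature change: the target mm becomes an explicit argument instead of being implied by current. That lets code which operates on another task's address space — presumably the UML changes elsewhere in this patch — pass the foreign mm directly. A self-contained mock of the convention change, with simplified stand-in types rather than the kernel's:

    #include <stdio.h>

    struct mm_struct { const char *owner; };      /* stand-in type */

    /* Old convention: the callee silently assumed current->mm. */
    static long map_old(unsigned long addr, unsigned long len)
    {
        return (long)(addr + len);                /* placeholder work */
    }

    /* New convention: whose address space is being modified is
     * spelled out by the caller. */
    static long map_new(struct mm_struct *mm, unsigned long addr,
                        unsigned long len)
    {
        printf("mapping into mm of %s\n", mm->owner);
        return (long)(addr + len);
    }

    int main(void)
    {
        struct mm_struct tracee = { "tracee" };

        map_old(0x1000, 0x2000);                  /* always current */
        map_new(&tracee, 0x1000, 0x2000);         /* any mm we hold */
        return 0;
    }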
+ */ +asmlinkage int sys_pipe(unsigned long __user * fildes) +{ + int fd[2]; + int error; + + error = do_pipe(fd); + if (!error) { + if (copy_to_user(fildes, fd, 2*sizeof(int))) + error = -EFAULT; + } + return error; +} + +/* common code for old and new mmaps */ +static inline long do_mmap2( + unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) +{ + int error = -EBADF; + struct file * file = NULL; + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + goto out; + } + + down_write(¤t->mm->mmap_sem); + error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return error; +} + +asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) +{ + return do_mmap2(addr, len, prot, flags, fd, pgoff); +} + +/* + * Perform the select(nd, in, out, ex, tv) and mmap() system + * calls. Linux/i386 didn't use to be able to handle more than + * 4 system call parameters, so these system calls used a memory + * block for parameter passing.. + */ + +struct mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) +{ + struct mmap_arg_struct a; + int err = -EFAULT; + + if (copy_from_user(&a, arg, sizeof(a))) + goto out; + + err = -EINVAL; + if (a.offset & ~PAGE_MASK) + goto out; + + err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); +out: + return err; +} + + +extern asmlinkage int sys_select(int, fd_set __user *, fd_set __user *, fd_set __user *, struct timeval __user *); + +struct sel_arg_struct { + unsigned long n; + fd_set __user *inp, *outp, *exp; + struct timeval __user *tvp; +}; + +asmlinkage int old_select(struct sel_arg_struct __user *arg) +{ + struct sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + /* sys_select() does the appropriate kernel locking */ + return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); +} + +/* + * sys_ipc() is the de-multiplexer for the SysV IPC calls.. + * + * This is really horribly ugly. 
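old_mmap() and old_select() above preserve the historical i386 convention of passing one user pointer to a block of arguments — a leftover from when the syscall path could not deliver six parameters in registers — which the kernel then copy_from_user()s and unpacks into do_mmap2(). A user-space illustration of the block layout for syscall 90 (hypothetical demo values; real programs should call mmap(2) through libc):

    #include <stdio.h>

    struct mmap_arg_struct {            /* mirrors the kernel struct */
        unsigned long addr, len, prot, flags, fd, offset;
    };

    int main(void)
    {
        struct mmap_arg_struct a = {
            .addr   = 0,
            .len    = 4096,
            .prot   = 0x3,              /* PROT_READ | PROT_WRITE */
            .flags  = 0x22,             /* MAP_PRIVATE | MAP_ANONYMOUS */
            .fd     = (unsigned long)-1,
            .offset = 0,
        };

        /* old_mmap() would receive &a as its single argument. */
        printf("arg block at %p, %zu bytes\n", (void *)&a, sizeof a);
        return 0;
    }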
+ */ +asmlinkage int sys_ipc (uint call, int first, int second, + int third, void __user *ptr, long fifth) +{ + int version, ret; + + version = call >> 16; /* hack for backward compatibility */ + call &= 0xffff; + + switch (call) { + case SEMOP: + return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL); + case SEMTIMEDOP: + return sys_semtimedop(first, (struct sembuf __user *)ptr, second, + (const struct timespec __user *)fifth); + + case SEMGET: + return sys_semget (first, second, third); + case SEMCTL: { + union semun fourth; + if (!ptr) + return -EINVAL; + if (get_user(fourth.__pad, (void * __user *) ptr)) + return -EFAULT; + return sys_semctl (first, second, third, fourth); + } + + case MSGSND: + return sys_msgsnd (first, (struct msgbuf __user *) ptr, + second, third); + case MSGRCV: + switch (version) { + case 0: { + struct ipc_kludge tmp; + if (!ptr) + return -EINVAL; + + if (copy_from_user(&tmp, + (struct ipc_kludge __user *) ptr, + sizeof (tmp))) + return -EFAULT; + return sys_msgrcv (first, tmp.msgp, second, + tmp.msgtyp, third); + } + default: + return sys_msgrcv (first, + (struct msgbuf __user *) ptr, + second, fifth, third); + } + case MSGGET: + return sys_msgget ((key_t) first, second); + case MSGCTL: + return sys_msgctl (first, second, (struct msqid_ds __user *) ptr); + + case SHMAT: + switch (version) { + default: { + ulong raddr; + ret = sys_shmat (first, (char __user *) ptr, second, &raddr); + if (ret) + return ret; + return put_user (raddr, (ulong __user *) third); + } + case 1: /* iBCS2 emulator entry point */ + if (!segment_eq(get_fs(), get_ds())) + return -EINVAL; + /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ + return sys_shmat (first, (char __user *) ptr, second, (ulong *) third); + } + case SHMDT: + return sys_shmdt ((char __user *)ptr); + case SHMGET: + return sys_shmget (first, second, third); + case SHMCTL: + return sys_shmctl (first, second, + (struct shmid_ds __user *) ptr); + default: + return -ENOSYS; + } +} + +/* + * Old cruft + */ +asmlinkage int sys_uname(struct old_utsname __user * name) +{ + int err; + if (!name) + return -EFAULT; + down_read(&uts_sem); + err=copy_to_user(name, &system_utsname, sizeof (*name)); + up_read(&uts_sem); + return err?-EFAULT:0; +} + +asmlinkage int sys_olduname(struct oldold_utsname __user * name) +{ + int error; + + if (!name) + return -EFAULT; + if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) + return -EFAULT; + + down_read(&uts_sem); + + error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); + error |= __put_user(0,name->sysname+__OLD_UTS_LEN); + error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); + error |= __put_user(0,name->nodename+__OLD_UTS_LEN); + error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); + error |= __put_user(0,name->release+__OLD_UTS_LEN); + error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); + error |= __put_user(0,name->version+__OLD_UTS_LEN); + error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN); + error |= __put_user(0,name->machine+__OLD_UTS_LEN); + + up_read(&uts_sem); + + error = error ? 
-EFAULT : 0; + + return error; +} diff -Nru a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c --- a/arch/ia64/kernel/efi.c Fri Oct 31 14:10:53 2003 +++ b/arch/ia64/kernel/efi.c Fri Oct 31 14:10:53 2003 @@ -711,6 +711,32 @@ return 0; } +int +valid_phys_addr_range (unsigned long phys_addr, unsigned long *size) +{ + void *efi_map_start, *efi_map_end, *p; + efi_memory_desc_t *md; + u64 efi_desc_size; + + efi_map_start = __va(ia64_boot_param->efi_memmap); + efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; + efi_desc_size = ia64_boot_param->efi_memdesc_size; + + for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { + md = p; + + if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) { + if (!(md->attribute & EFI_MEMORY_WB)) + return 0; + + if (*size > md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr) + *size = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - phys_addr; + return 1; + } + } + return 0; +} + static void __exit efivars_exit (void) { diff -Nru a/arch/ia64/kernel/gate-data.S b/arch/ia64/kernel/gate-data.S --- a/arch/ia64/kernel/gate-data.S Fri Oct 31 14:10:54 2003 +++ b/arch/ia64/kernel/gate-data.S Fri Oct 31 14:10:54 2003 @@ -1,3 +1,3 @@ - .section .data.gate, "ax" + .section .data.gate, "aw" .incbin "arch/ia64/kernel/gate.so" diff -Nru a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c --- a/arch/ia64/kernel/irq.c Fri Oct 31 14:10:54 2003 +++ b/arch/ia64/kernel/irq.c Fri Oct 31 14:10:54 2003 @@ -405,7 +405,7 @@ spin_lock_irqsave(&desc->lock, flags); switch (desc->depth) { case 1: { - unsigned int status = desc->status & ~(IRQ_DISABLED | IRQ_INPROGRESS); + unsigned int status = desc->status & ~IRQ_DISABLED; desc->status = status; if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = status | IRQ_REPLAY; diff -Nru a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c --- a/arch/ia64/kernel/module.c Fri Oct 31 14:10:53 2003 +++ b/arch/ia64/kernel/module.c Fri Oct 31 14:10:53 2003 @@ -322,6 +322,10 @@ void module_free (struct module *mod, void *module_region) { + if (mod->arch.init_unw_table && module_region == mod->module_init) { + unw_remove_unwind_table(mod->arch.init_unw_table); + mod->arch.init_unw_table = NULL; + } vfree(module_region); } @@ -843,28 +847,92 @@ return -ENOEXEC; } +/* + * Modules contain a single unwind table which covers both the core and the init text + * sections but since the two are not contiguous, we need to split this table up such that + * we can register (and unregister) each "segment" separately. Fortunately, this sounds + * more complicated than it really is. + */ +static void +register_unwind_table (struct module *mod) +{ + struct unw_table_entry *start = (void *) mod->arch.unwind->sh_addr; + struct unw_table_entry *end = start + mod->arch.unwind->sh_size / sizeof (*start); + struct unw_table_entry tmp, *e1, *e2, *core, *init; + unsigned long num_init = 0, num_core = 0; + + /* First, count how many init and core unwind-table entries there are. */ + for (e1 = start; e1 < end; ++e1) + if (in_init(mod, e1->start_offset)) + ++num_init; + else + ++num_core; + /* + * Second, sort the table such that all unwind-table entries for the init and core + * text sections are nicely separated. We do this with a stupid bubble sort + * (unwind tables don't get ridiculously huge).
+ */ + for (e1 = start; e1 < end; ++e1) { + for (e2 = e1 + 1; e2 < end; ++e2) { + if (e2->start_offset < e1->start_offset) { + tmp = *e1; + *e1 = *e2; + *e2 = tmp; + } + } + } + /* + * Third, locate the init and core segments in the unwind table: + */ + if (in_init(mod, start->start_offset)) { + init = start; + core = start + num_init; + } else { + core = start; + init = start + num_core; + } + + DEBUGP("%s: name=%s, gp=%lx, num_init=%lu, num_core=%lu\n", __FUNCTION__, + mod->name, mod->arch.gp, num_init, num_core); + + /* + * Fourth, register both tables (if not empty). + */ + if (num_core > 0) { + mod->arch.core_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp, + core, core + num_core); + DEBUGP("%s: core: handle=%p [%p-%p)\n", __FUNCTION__, + mod->arch.core_unw_table, core, core + num_core); + } + if (num_init > 0) { + mod->arch.init_unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp, + init, init + num_init); + DEBUGP("%s: init: handle=%p [%p-%p)\n", __FUNCTION__, + mod->arch.init_unw_table, init, init + num_init); + } +} + int module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { DEBUGP("%s: init: entry=%p\n", __FUNCTION__, mod->init); if (mod->arch.unwind) - mod->arch.unw_table = unw_add_unwind_table(mod->name, 0, mod->arch.gp, - (void *) mod->arch.unwind->sh_addr, - ((void *) mod->arch.unwind->sh_addr - + mod->arch.unwind->sh_size)); + register_unwind_table(mod); return 0; } void module_arch_cleanup (struct module *mod) { - if (mod->arch.unwind) - unw_remove_unwind_table(mod->arch.unw_table); + if (mod->arch.init_unw_table) + unw_remove_unwind_table(mod->arch.init_unw_table); + if (mod->arch.core_unw_table) + unw_remove_unwind_table(mod->arch.core_unw_table); } #ifdef CONFIG_SMP void -percpu_modcopy (void *pcpudst, const void *src, unsigned long size) +percpu_modcopy (void *pcpudst, const void *src, unsigned long size) { unsigned int i; for (i = 0; i < NR_CPUS; i++) diff -Nru a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c --- a/arch/ia64/kernel/perfmon.c Fri Oct 31 14:10:54 2003 +++ b/arch/ia64/kernel/perfmon.c Fri Oct 31 14:10:54 2003 @@ -202,8 +202,8 @@ #define GET_PMU_OWNER() pfm_get_cpu_var(pmu_owner) #define GET_PMU_CTX() pfm_get_cpu_var(pmu_ctx) -#define LOCK_PFS() spin_lock(&pfm_sessions.pfs_lock) -#define UNLOCK_PFS() spin_unlock(&pfm_sessions.pfs_lock) +#define LOCK_PFS(g) spin_lock_irqsave(&pfm_sessions.pfs_lock, g) +#define UNLOCK_PFS(g) spin_unlock_irqrestore(&pfm_sessions.pfs_lock, g) #define PFM_REG_RETFLAG_SET(flags, val) do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0) @@ -618,6 +618,7 @@ .get_sb = pfmfs_get_sb, .kill_sb = kill_anon_super, }; + DEFINE_PER_CPU(unsigned long, pfm_syst_info); DEFINE_PER_CPU(struct task_struct *, pmu_owner); DEFINE_PER_CPU(pfm_context_t *, pmu_ctx); @@ -634,6 +635,8 @@ static void pfm_lazy_save_regs (struct task_struct *ta); #endif +void dump_pmu_state(const char *); + /* * the HP simulator must be first because * CONFIG_IA64_HP_SIM is independent of CONFIG_MCKINLEY or CONFIG_ITANIUM @@ -1283,10 +1286,11 @@ static int pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu) { + unsigned long flags; /* * validy checks on cpu_mask have been done upstream */ - LOCK_PFS(); + LOCK_PFS(flags); DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", pfm_sessions.pfs_sys_sessions, @@ -1325,7 +1329,7 @@ is_syswide, cpu)); - UNLOCK_PFS(); + UNLOCK_PFS(flags); return 0; @@ -1334,7 +1338,7 @@ pfm_sessions.pfs_sys_session[cpu]->pid, 
smp_processor_id())); abort: - UNLOCK_PFS(); + UNLOCK_PFS(flags); return -EBUSY; @@ -1343,11 +1347,11 @@ static int pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu) { - + unsigned long flags; /* * validy checks on cpu_mask have been done upstream */ - LOCK_PFS(); + LOCK_PFS(flags); DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", pfm_sessions.pfs_sys_sessions, @@ -1380,7 +1384,7 @@ is_syswide, cpu)); - UNLOCK_PFS(); + UNLOCK_PFS(flags); return 0; } @@ -1655,7 +1659,7 @@ } /* - * context is locked when coming here + * context is locked when coming here and interrupts are disabled */ static inline int pfm_do_fasync(int fd, struct file *filp, pfm_context_t *ctx, int on) @@ -1789,6 +1793,7 @@ * even if the task itself is in the middle of being ctxsw out. */ static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); + static int pfm_close(struct inode *inode, struct file *filp) { @@ -1803,10 +1808,6 @@ int free_possible = 1; int state, is_system; - { u64 psr = pfm_get_psr(); - BUG_ON((psr & IA64_PSR_I) == 0UL); - } - DPRINT(("pfm_close called private=%p\n", filp->private_data)); if (!inode) { @@ -1815,7 +1816,7 @@ } if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_close: bad magic [%d]\n", current->pid); + DPRINT(("bad magic for [%d]\n", current->pid)); return -EBADF; } @@ -1824,6 +1825,23 @@ printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid); return -EBADF; } + /* + * remove our file from the async queue, if we use this mode. + * This can be done without the context being protected. We come + * here when the context has become unreachable by other tasks. + * + * We may still have active monitoring at this point and we may + * end up in pfm_overflow_handler(). However, fasync_helper() + * operates with interrupts disabled and it cleans up the + * queue. If the PMU handler is called prior to entering + * fasync_helper() then it will send a signal. If it is + * invoked after, it will find an empty queue and no + * signal will be sent. In both cases, we are safe + */ + if (filp->f_flags & FASYNC) { + DPRINT(("[%d] cleaning up async_queue=%p\n", current->pid, ctx->ctx_async_queue)); + pfm_do_fasync (-1, filp, ctx, 0); + } PROTECT_CTX(ctx, flags); @@ -1832,24 +1850,17 @@ task = PFM_CTX_TASK(ctx); - /* - * remove our file from the async queue, if we use it - */ - if (filp->f_flags & FASYNC) { - DPRINT(("[%d] before async_queue=%p\n", current->pid, ctx->ctx_async_queue)); - pfm_do_fasync (-1, filp, ctx, 0); - DPRINT(("[%d] after async_queue=%p\n", current->pid, ctx->ctx_async_queue)); - } + regs = ia64_task_regs(task); - DPRINT(("[%d] ctx_state=%d\n", current->pid, state)); + DPRINT(("[%d] ctx_state=%d is_current=%d\n", + current->pid, state, + task == current ? 1 : 0)); if (state == PFM_CTX_UNLOADED || state == PFM_CTX_TERMINATED) { goto doit; } - regs = ia64_task_regs(task); - /* * context still loaded/masked and self monitoring, * we stop/unload and we destroy right here @@ -1898,12 +1909,11 @@ ctx->ctx_state = PFM_CTX_TERMINATED; - DPRINT(("[%d] ctx_state=%d\n", current->pid, state)); + DPRINT(("[%d] ctx_state=%d\n", current->pid, ctx->ctx_state)); } goto doit; } - /* * The task is currently blocked or will block after an overflow.
* we must force it to wakeup to get out of the @@ -3482,6 +3492,7 @@ pfm_use_debug_registers(struct task_struct *task) { pfm_context_t *ctx = task->thread.pfm_context; + unsigned long flags; int ret = 0; if (pmu_conf.use_rr_dbregs == 0) return 0; @@ -3503,7 +3514,7 @@ */ if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1; - LOCK_PFS(); + LOCK_PFS(flags); /* * We cannot allow setting breakpoints when system wide monitoring @@ -3519,7 +3530,7 @@ pfm_sessions.pfs_sys_use_dbregs, task->pid, ret)); - UNLOCK_PFS(); + UNLOCK_PFS(flags); return ret; } @@ -3535,11 +3546,12 @@ int pfm_release_debug_registers(struct task_struct *task) { + unsigned long flags; int ret; if (pmu_conf.use_rr_dbregs == 0) return 0; - LOCK_PFS(); + LOCK_PFS(flags); if (pfm_sessions.pfs_ptrace_use_dbregs == 0) { printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid); ret = -1; @@ -3547,7 +3559,7 @@ pfm_sessions.pfs_ptrace_use_dbregs--; ret = 0; } - UNLOCK_PFS(); + UNLOCK_PFS(flags); return ret; } @@ -3723,7 +3735,6 @@ memset(pfm_stats, 0, sizeof(pfm_stats)); for(m=0; m < NR_CPUS; m++) pfm_stats[m].pfm_ovfl_intr_cycles_min = ~0UL; } - return 0; } @@ -3735,6 +3746,7 @@ { struct thread_struct *thread = NULL; pfarg_dbreg_t *req = (pfarg_dbreg_t *)arg; + unsigned long flags; dbreg_t dbreg; unsigned int rnum; int first_time; @@ -3793,7 +3805,7 @@ * written after the context is loaded */ if (is_loaded) { - LOCK_PFS(); + LOCK_PFS(flags); if (first_time && is_system) { if (pfm_sessions.pfs_ptrace_use_dbregs) @@ -3801,7 +3813,7 @@ else pfm_sessions.pfs_sys_use_dbregs++; } - UNLOCK_PFS(); + UNLOCK_PFS(flags); } if (ret != 0) return ret; @@ -3902,11 +3914,11 @@ * in case it was our first attempt, we undo the global modifications */ if (first_time) { - LOCK_PFS(); + LOCK_PFS(flags); if (ctx->ctx_fl_system) { pfm_sessions.pfs_sys_use_dbregs--; } - UNLOCK_PFS(); + UNLOCK_PFS(flags); ctx->ctx_fl_using_dbreg = 0; } /* @@ -3959,7 +3971,11 @@ DPRINT(("[%d] should be running on CPU%d\n", current->pid, ctx->ctx_cpu)); return -EBUSY; } - + DPRINT(("current [%d] task [%d] ctx_state=%d is_system=%d\n", + current->pid, + PFM_CTX_TASK(ctx)->pid, + state, + is_system)); /* * in system mode, we need to update the PMU directly * and the user level state of the caller, which may not @@ -4157,6 +4173,7 @@ struct task_struct *task; struct thread_struct *thread; struct pfm_context_t *old; + unsigned long flags; #ifndef CONFIG_SMP struct task_struct *owner_task = NULL; #endif @@ -4217,7 +4234,7 @@ DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid)); goto error; } - LOCK_PFS(); + LOCK_PFS(flags); if (is_system) { if (pfm_sessions.pfs_ptrace_use_dbregs) { @@ -4230,7 +4247,7 @@ } } - UNLOCK_PFS(); + UNLOCK_PFS(flags); if (ret) goto error; } @@ -4377,9 +4394,9 @@ * we must undo the dbregs setting (for system-wide) */ if (ret && set_dbregs) { - LOCK_PFS(); + LOCK_PFS(flags); pfm_sessions.pfs_sys_use_dbregs--; - UNLOCK_PFS(); + UNLOCK_PFS(flags); } /* * release task, there is now a link with the context @@ -4605,11 +4622,14 @@ printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state); break; } + UNPROTECT_CTX(ctx, flags); + { u64 psr = pfm_get_psr(); BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); BUG_ON(GET_PMU_OWNER()); + BUG_ON(ia64_psr(regs)->up); + BUG_ON(ia64_psr(regs)->pp); } - UNPROTECT_CTX(ctx, flags); /* * All memory free operations (especially for vmalloc'ed memory) @@ -4697,7 +4717,7 @@ /* * context is UNLOADED, MASKED, TERMINATED we are safe to go */ - if 
(state != PFM_CTX_LOADED == 0) return 0; + if (state != PFM_CTX_LOADED) return 0; if (state == PFM_CTX_ZOMBIE) return -EINVAL; @@ -5488,7 +5508,7 @@ char *p = page; struct list_head * pos; pfm_buffer_fmt_t * entry; - unsigned long psr; + unsigned long psr, flags; int online_cpus = 0; int i; @@ -5528,7 +5548,7 @@ } } - LOCK_PFS(); + LOCK_PFS(flags); p += sprintf(p, "proc_sessions : %u\n" "sys_sessions : %u\n" "sys_use_dbregs : %u\n" @@ -5537,7 +5557,7 @@ pfm_sessions.pfs_sys_sessions, pfm_sessions.pfs_sys_use_dbregs, pfm_sessions.pfs_ptrace_use_dbregs); - UNLOCK_PFS(); + UNLOCK_PFS(flags); spin_lock(&pfm_buffer_fmt_lock); @@ -5712,10 +5732,6 @@ */ ctx->ctx_saved_psr_up = psr & IA64_PSR_UP; - { u64 foo = pfm_get_psr(); - BUG_ON(foo & ((IA64_PSR_UP|IA64_PSR_PP))); - } - /* * release ownership of this PMU. * PM interrupts are masked, so nothing @@ -5771,6 +5787,8 @@ */ psr = pfm_get_psr(); + BUG_ON(psr & (IA64_PSR_I)); + /* * stop monitoring: * This is the last instruction which may generate an overflow @@ -5785,12 +5803,6 @@ */ ctx->ctx_saved_psr_up = psr & IA64_PSR_UP; -#if 1 - { u64 foo = pfm_get_psr(); - BUG_ON(foo & (IA64_PSR_I)); - BUG_ON(foo & ((IA64_PSR_UP|IA64_PSR_PP))); - } -#endif return; save_error: printk(KERN_ERR "perfmon: pfm_save_regs CPU%d [%d] NULL context PM_VALID=%ld\n", @@ -5805,11 +5817,9 @@ struct thread_struct *t; unsigned long flags; -#if 1 - { u64 foo = pfm_get_psr(); - BUG_ON(foo & IA64_PSR_UP); + { u64 psr = pfm_get_psr(); + BUG_ON(psr & IA64_PSR_UP); } -#endif ctx = PFM_GET_CTX(task); t = &task->thread; @@ -5851,7 +5861,7 @@ /* * unfreeze PMU if had pending overflows */ - if (t->pmcs[0] & ~1UL) pfm_unfreeze_pmu(); + if (t->pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); /* * now get can unmask PMU interrupts, they will @@ -5900,10 +5910,8 @@ flags = pfm_protect_ctx_ctxsw(ctx); psr = pfm_get_psr(); -#if 1 BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); BUG_ON(psr & IA64_PSR_I); -#endif if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) { struct pt_regs *regs = ia64_task_regs(task); @@ -6060,10 +6068,8 @@ t = &task->thread; psr = pfm_get_psr(); -#if 1 BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); BUG_ON(psr & IA64_PSR_I); -#endif /* * we restore ALL the debug registers to avoid picking up @@ -6218,7 +6224,7 @@ /* * clear whatever overflow status bits there were */ - task->thread.pmcs[0] &= ~0x1; + task->thread.pmcs[0] = 0; } ovfl_val = pmu_conf.ovfl_val; /* @@ -6400,6 +6406,11 @@ pfm_clear_psr_pp(); pfm_clear_psr_up(); + /* + * we run with the PMU not frozen at all times + */ + pfm_unfreeze_pmu(); + if (smp_processor_id() == 0) register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); @@ -6427,49 +6438,75 @@ if (PMD_IS_IMPL(i) == 0) continue; ia64_set_pmd(i, 0UL); } - - /* - * we run with the PMU not frozen at all times - */ - pfm_unfreeze_pmu(); } /* * used for debug purposes only */ void -dump_pmu_state(void) +dump_pmu_state(const char *from) { struct task_struct *task; struct thread_struct *t; + struct pt_regs *regs; pfm_context_t *ctx; - unsigned long psr; - int i; + unsigned long psr, dcr, info, flags; + int i, this_cpu; + + local_irq_save(flags); - printk("current [%d] %s\n", current->pid, current->comm); + this_cpu = smp_processor_id(); + regs = ia64_task_regs(current); + info = PFM_CPUINFO_GET(); + dcr = ia64_getreg(_IA64_REG_CR_DCR); + + if (info == 0 && ia64_psr(regs)->pp == 0 && (dcr & IA64_DCR_PP) == 0) { + local_irq_restore(flags); + return; + } + + printk("CPU%d from %s() current [%d] iip=0x%lx %s\n", + this_cpu, + from, + current->pid, + regs->cr_iip, + current->comm); 
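The recurring LOCK_PFS()/UNLOCK_PFS() conversion in the perfmon hunks above switches the session lock from plain spin_lock() to the irq-saving variant, so the lock can also be taken with interrupts disabled. A minimal stand-alone sketch of that idiom using 2.6.0-era primitives; demo_lock, demo_sessions and demo_reserve_session are illustrative names, not perfmon's:

#include <linux/spinlock.h>

static spinlock_t demo_lock = SPIN_LOCK_UNLOCKED;
static unsigned int demo_sessions;

static void demo_reserve_session(void)
{
	unsigned long flags;

	/* Disable local interrupts and take the lock; 'flags' remembers
	 * the previous interrupt state, so this is safe even if the
	 * caller already runs with interrupts off. */
	spin_lock_irqsave(&demo_lock, flags);
	demo_sessions++;
	/* Release the lock and restore the saved interrupt state. */
	spin_unlock_irqrestore(&demo_lock, flags);
}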
task = GET_PMU_OWNER(); ctx = GET_PMU_CTX(); - printk("owner [%d] ctx=%p\n", task ? task->pid : -1, ctx); + printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx); psr = pfm_get_psr(); - printk("psr.pp=%ld psr.up=%ld\n", (psr >> IA64_PSR_PP_BIT) &0x1UL, (psr >> IA64_PSR_PP_BIT)&0x1UL); + printk("->CPU%d pmc0=0x%lx psr.pp=%d psr.up=%d dcr.pp=%d syst_info=0x%lx user_psr.up=%d user_psr.pp=%d\n", + this_cpu, + ia64_get_pmc(0), + psr & IA64_PSR_PP ? 1 : 0, + psr & IA64_PSR_UP ? 1 : 0, + dcr & IA64_DCR_PP ? 1 : 0, + info, + ia64_psr(regs)->up, + ia64_psr(regs)->pp); + + ia64_psr(regs)->up = 0; + ia64_psr(regs)->pp = 0; t = &current->thread; for (i=1; PMC_IS_LAST(i) == 0; i++) { if (PMC_IS_IMPL(i) == 0) continue; - printk("pmc[%d]=0x%lx tpmc=0x%lx\n", i, ia64_get_pmc(i), t->pmcs[i]); + printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, t->pmcs[i]); } for (i=1; PMD_IS_LAST(i) == 0; i++) { if (PMD_IS_IMPL(i) == 0) continue; - printk("pmd[%d]=0x%lx tpmd=0x%lx\n", i, ia64_get_pmd(i), t->pmds[i]); + printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, t->pmds[i]); } + if (ctx) { - printk("ctx_state=%d vaddr=%p addr=%p fd=%d ctx_task=[%d] saved_psr_up=0x%lx\n", + printk("->CPU%d ctx_state=%d vaddr=%p addr=%p fd=%d ctx_task=[%d] saved_psr_up=0x%lx\n", + this_cpu, ctx->ctx_state, ctx->ctx_smpl_vaddr, ctx->ctx_smpl_hdr, @@ -6477,6 +6514,7 @@ ctx->ctx_msgq_tail, ctx->ctx_saved_psr_up); } + local_irq_restore(flags); } /* @@ -6499,10 +6537,8 @@ PFM_SET_WORK_PENDING(task, 0); /* - * restore default psr settings + * the psr bits are already set properly in copy_thread() */ - ia64_psr(regs)->pp = ia64_psr(regs)->up = 0; - ia64_psr(regs)->sp = 1; } #else /* !CONFIG_PERFMON */ asmlinkage long diff -Nru a/arch/ia64/kernel/perfmon_mckinley.h b/arch/ia64/kernel/perfmon_mckinley.h --- a/arch/ia64/kernel/perfmon_mckinley.h Fri Oct 31 14:10:54 2003 +++ b/arch/ia64/kernel/perfmon_mckinley.h Fri Oct 31 14:10:54 2003 @@ -167,7 +167,7 @@ val14 = ctx->ctx_pmcs[14]; check_case1 = 1; break; - case 14: val8 = ctx->ctx_pmcs[13]; + case 14: val8 = ctx->ctx_pmcs[8]; val13 = ctx->ctx_pmcs[13]; val14 = *val; check_case1 = 1; diff -Nru a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c --- a/arch/ia64/kernel/process.c Fri Oct 31 14:10:53 2003 +++ b/arch/ia64/kernel/process.c Fri Oct 31 14:10:53 2003 @@ -353,9 +353,13 @@ /* copy parts of thread_struct: */ p->thread.ksp = (unsigned long) child_stack - 16; - /* stop some PSR bits from being inherited: */ + /* stop some PSR bits from being inherited. + * the psr.up/psr.pp bits must be cleared on fork but inherited on execve() + * therefore we must specify them explicitly here and not include them in + * IA64_PSR_BITS_TO_CLEAR. + */ child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) - & ~IA64_PSR_BITS_TO_CLEAR); + & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); /* * NOTE: The calling convention considers all floating point diff -Nru a/arch/ia64/kernel/unwind_i.h b/arch/ia64/kernel/unwind_i.h --- a/arch/ia64/kernel/unwind_i.h Fri Oct 31 14:10:53 2003 +++ b/arch/ia64/kernel/unwind_i.h Fri Oct 31 14:10:53 2003 @@ -1,5 +1,5 @@ /* - * Copyright (C) 2000, 2002 Hewlett-Packard Co + * Copyright (C) 2000, 2002-2003 Hewlett-Packard Co * David Mosberger-Tang * * Kernel unwind support.
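register_unwind_table() in the module.c hunk above separates init from core unwind entries by sorting on start_offset and then splitting the table in two. The user-space sketch below mimics just that partitioning step; INIT_BASE, the entry values, and all names are invented for illustration:

#include <stdio.h>

struct entry {
	unsigned long start_offset;
};

#define INIT_BASE 0x100000UL	/* pretend the init section starts here */

int main(void)
{
	struct entry tab[] = {
		{ 0x100200UL }, { 0x200UL }, { 0x100100UL }, { 0x300UL },
	};
	int n = 4, i, j, num_core = 0;
	struct entry tmp;

	/* Same simple bubble sort the patch uses: module unwind tables
	 * are small, so an O(n^2) sort is fine. */
	for (i = 0; i < n; i++)
		for (j = i + 1; j < n; j++)
			if (tab[j].start_offset < tab[i].start_offset) {
				tmp = tab[i];
				tab[i] = tab[j];
				tab[j] = tmp;
			}

	/* After sorting, the core entries (below INIT_BASE) are contiguous,
	 * so the table can be registered as two separate segments. */
	while (num_core < n && tab[num_core].start_offset < INIT_BASE)
		num_core++;
	printf("core entries: %d, init entries: %d\n", num_core, n - num_core);
	return 0;
}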
@@ -43,12 +43,6 @@ u64 header; u64 desc[0]; /* unwind descriptors */ /* personality routine and language-specific data follow behind descriptors */ -}; - -struct unw_table_entry { - u64 start_offset; - u64 end_offset; - u64 info_offset; }; struct unw_table { diff -Nru a/arch/sparc/kernel/entry.S b/arch/sparc/kernel/entry.S --- a/arch/sparc/kernel/entry.S Fri Oct 31 14:10:53 2003 +++ b/arch/sparc/kernel/entry.S Fri Oct 31 14:10:53 2003 @@ -38,7 +38,7 @@ #define curptr g6 -#define NR_SYSCALLS 268 /* Each OS is different... */ +#define NR_SYSCALLS 272 /* Each OS is different... */ /* These are just handy. */ #define _SV save %sp, -STACKFRAME_SZ, %sp diff -Nru a/arch/sparc/kernel/systbls.S b/arch/sparc/kernel/systbls.S --- a/arch/sparc/kernel/systbls.S Fri Oct 31 14:10:53 2003 +++ b/arch/sparc/kernel/systbls.S Fri Oct 31 14:10:53 2003 @@ -72,7 +72,8 @@ /*250*/ .long sparc_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nfsservctl /*255*/ .long sys_nis_syscall, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep /*260*/ .long sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun -/*265*/ .long sys_timer_delete, sys_timer_create, sys_nis_syscall, sys_nis_syscall +/*265*/ .long sys_timer_delete, sys_timer_create, sys_nis_syscall, sys_io_setup, sys_io_destroy +/*270*/ .long sys_io_submit, sys_io_cancel, sys_io_getevents, sys_nis_syscall #ifdef CONFIG_SUNOS_EMUL /* Now the SunOS syscall table. */ @@ -172,5 +173,8 @@ /*260*/ .long sunos_nosys, sunos_nosys, sunos_nosys .long sunos_nosys, sunos_nosys, sunos_nosys .long sunos_nosys, sunos_nosys, sunos_nosys + .long sunos_nosys +/*270*/ .long sunos_nosys, sunos_nosys, sunos_nosys + .long sunos_nosys #endif diff -Nru a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig --- a/arch/sparc64/Kconfig Fri Oct 31 14:10:54 2003 +++ b/arch/sparc64/Kconfig Fri Oct 31 14:10:54 2003 @@ -813,7 +813,7 @@ # the generic version in that case. config HAVE_DEC_LOCK bool - depends on !DEBUG_SPINLOCK + depends on SMP && !DEBUG_SPINLOCK default y config DEBUG_SPINLOCK_SLEEP diff -Nru a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S --- a/arch/sparc64/kernel/entry.S Fri Oct 31 14:10:53 2003 +++ b/arch/sparc64/kernel/entry.S Fri Oct 31 14:10:53 2003 @@ -26,7 +26,7 @@ #define curptr g6 -#define NR_SYSCALLS 268 /* Each OS is different... */ +#define NR_SYSCALLS 272 /* Each OS is different... 
*/ .text .align 32 diff -Nru a/arch/sparc64/kernel/rtrap.S b/arch/sparc64/kernel/rtrap.S --- a/arch/sparc64/kernel/rtrap.S Fri Oct 31 14:10:53 2003 +++ b/arch/sparc64/kernel/rtrap.S Fri Oct 31 14:10:53 2003 @@ -270,9 +270,14 @@ #ifdef CONFIG_PREEMPT ldsw [%g6 + TI_PRE_COUNT], %l5 brnz %l5, kern_fpucheck + ldx [%g6 + TI_FLAGS], %l5 + andcc %l5, _TIF_NEED_RESCHED, %g0 + be,pt %xcc, kern_fpucheck + srl %l4, 20, %l5 + cmp %l5, 0 + bne,pn %xcc, kern_fpucheck sethi %hi(PREEMPT_ACTIVE), %l6 stw %l6, [%g6 + TI_PRE_COUNT] - wrpr 0, %pil call schedule nop ba,pt %xcc, rtrap diff -Nru a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c --- a/arch/sparc64/kernel/sparc64_ksyms.c Fri Oct 31 14:10:54 2003 +++ b/arch/sparc64/kernel/sparc64_ksyms.c Fri Oct 31 14:10:54 2003 @@ -136,6 +136,7 @@ EXPORT_SYMBOL(__read_unlock); EXPORT_SYMBOL(__write_lock); EXPORT_SYMBOL(__write_unlock); +EXPORT_SYMBOL(__write_trylock); #endif /* Hard IRQ locking */ diff -Nru a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S --- a/arch/sparc64/kernel/systbls.S Fri Oct 31 14:10:53 2003 +++ b/arch/sparc64/kernel/systbls.S Fri Oct 31 14:10:53 2003 @@ -72,7 +72,8 @@ /*250*/ .word sys32_mremap, sys32_sysctl, sys_getsid, sys_fdatasync, sys32_nfsservctl .word sys_ni_syscall, compat_clock_settime, compat_clock_gettime, compat_clock_getres, compat_clock_nanosleep /*260*/ .word compat_sys_sched_getaffinity, compat_sys_sched_setaffinity, compat_timer_settime, compat_timer_gettime, sys_timer_getoverrun - .word sys_timer_delete, sys32_timer_create, sys_ni_syscall, sys_ni_syscall + .word sys_timer_delete, sys32_timer_create, sys_ni_syscall, sys_ni_syscall, sys_ni_syscall +/*270*/ .word sys_ni_syscall, sys_ni_syscall, sys_ni_syscall, sys_ni_syscall /* Now the 64-bit native Linux syscall table. 
*/ @@ -133,7 +134,8 @@ /*250*/ .word sys64_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nfsservctl .word sys_ni_syscall, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep /*260*/ .word sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun - .word sys_timer_delete, sys_timer_create, sys_ni_syscall, sys_ni_syscall + .word sys_timer_delete, sys_timer_create, sys_ni_syscall, sys_io_setup, sys_io_destroy +/*270*/ .word sys_io_submit, sys_io_cancel, sys_io_getevents, sys_ni_syscall #if defined(CONFIG_SUNOS_EMUL) || defined(CONFIG_SOLARIS_EMUL) || \ defined(CONFIG_SOLARIS_EMUL_MODULE) @@ -233,6 +235,7 @@ .word sunos_nosys, sunos_nosys, sunos_nosys .word sunos_nosys, sunos_nosys, sunos_nosys .word sunos_nosys, sunos_nosys, sunos_nosys - .word sunos_nosys + .word sunos_nosys, sunos_nosys, sunos_nosys + .word sunos_nosys, sunos_nosys, sunos_nosys #endif diff -Nru a/arch/sparc64/lib/dec_and_lock.S b/arch/sparc64/lib/dec_and_lock.S --- a/arch/sparc64/lib/dec_and_lock.S Fri Oct 31 14:10:53 2003 +++ b/arch/sparc64/lib/dec_and_lock.S Fri Oct 31 14:10:53 2003 @@ -29,7 +29,7 @@ atomic_dec_and_lock: /* %o0 = counter, %o1 = lock */ loop1: lduw [%o0], %g5 subcc %g5, 1, %g7 - be,pn %icc, to_zero + be,pn %icc, start_to_zero nop nzero: cas [%o0], %g5, %g7 cmp %g5, %g7 @@ -40,6 +40,7 @@ membar #StoreLoad | #StoreStore retl mov %g1, %o0 +start_to_zero: #ifdef CONFIG_PREEMPT ldsw [%g6 + TI_PRE_COUNT], %g3 add %g3, 1, %g3 diff -Nru a/arch/sparc64/lib/rwlock.S b/arch/sparc64/lib/rwlock.S --- a/arch/sparc64/lib/rwlock.S Fri Oct 31 14:10:53 2003 +++ b/arch/sparc64/lib/rwlock.S Fri Oct 31 14:10:53 2003 @@ -63,5 +63,27 @@ be,pt %icc, 99b membar #StoreLoad | #StoreStore ba,a,pt %xcc, 1b + + .globl __write_trylock +__write_trylock: /* %o0 = lock_ptr */ + sethi %hi(0x80000000), %g2 +1: lduw [%o0], %g5 + brnz,pn %g5, __write_trylock_fail +4: or %g5, %g2, %g7 + + cas [%o0], %g5, %g7 + cmp %g5, %g7 + be,pt %icc, __write_trylock_succeed + membar #StoreLoad | #StoreStore + + ba,pt %xcc, 1b + nop +__write_trylock_succeed: + retl + mov 1, %o0 + +__write_trylock_fail: + retl + mov 0, %o0 rwlock_impl_end: diff -Nru a/arch/um/Kconfig b/arch/um/Kconfig --- a/arch/um/Kconfig Fri Oct 31 14:10:53 2003 +++ b/arch/um/Kconfig Fri Oct 31 14:10:53 2003 @@ -61,6 +61,20 @@ config NET bool "Networking support" + help + Unless you really know what you are doing, you should say Y here. + The reason is that some programs need kernel networking support even + when running on a stand-alone machine that isn't connected to any + other computer. If you are upgrading from an older kernel, you + should consider updating your networking tools too because changes + in the kernel and the tools often go hand in hand. The tools are + contained in the package net-tools, the location and version number + of which are given in Documentation/Changes. + + For a general introduction to Linux networking, it is highly + recommended to read the NET-HOWTO, available from + . + source "fs/Kconfig.binfmt" @@ -85,6 +99,19 @@ If you'd like to be able to work with files stored on the host, say Y or M here; otherwise say N. +config HPPFS + tristate "HoneyPot ProcFS" + help + hppfs (HoneyPot ProcFS) is a filesystem which allows UML /proc + entries to be overridden, removed, or fabricated from the host. + Its purpose is to allow a UML to appear to be a physical machine + by removing or changing anything in /proc which gives away the + identity of a UML. 
+ + See http://user-mode-linux.sf.net/hppfs.html for more information. + + You only need this if you are setting up a UML honeypot. Otherwise, + it is safe to say 'N' here. config MCONSOLE bool "Management console" @@ -105,6 +132,16 @@ config MAGIC_SYSRQ bool "Magic SysRq key" depends on MCONSOLE + help + If you say Y here, you will have some control over the system even + if the system crashes for example during kernel debugging (e.g., you + will be able to flush the buffer cache to disk, reboot the system + immediately or dump some status information). This is accomplished + by pressing various keys while holding SysRq (Alt+PrintScreen). It + also works on a serial console (on PC hardware at least), if you + send a BREAK and then within 5 seconds a command keypress. The + keys are documented in Documentation/sysrq.txt. Don't say Y + unless you really know what this hack does. config HOST_2G_2G bool "2G/2G host address space split" @@ -159,6 +196,9 @@ config HIGHMEM bool "Highmem support" +config PROC_MM + bool "/proc/mm support" + config KERNEL_STACK_ORDER int "Kernel stack size order" default 2 @@ -181,10 +221,10 @@ bool default NET -source "arch/um/Kconfig_net" - source "net/Kconfig" +source "arch/um/Kconfig_net" + source "fs/Kconfig" source "security/Kconfig" @@ -239,6 +279,10 @@ config PT_PROXY bool "Enable ptrace proxy" depends on XTERM_CHAN && DEBUG_INFO + help + This option enables a debugging interface which allows gdb to debug + the kernel without needing to actually attach to kernel threads. + If you want to do kernel debugging, say Y here; otherwise say N. config GPROF bool "Enable gprof support" diff -Nru a/arch/um/Kconfig_block b/arch/um/Kconfig_block --- a/arch/um/Kconfig_block Fri Oct 31 14:10:54 2003 +++ b/arch/um/Kconfig_block Fri Oct 31 14:10:54 2003 @@ -29,6 +29,21 @@ wise choice too. In all other cases (for example, if you're just playing around with User-Mode Linux) you can choose N. +# Turn this back on when the driver actually works +# +#config BLK_DEV_COW +# tristate "COW block device" +# help +# This is a layered driver which sits above two other block devices. +# One is read-only, and the other is a read-write layer which stores +# all changes. This provides the illusion that the read-only layer +# can be mounted read-write and changed. + +config BLK_DEV_COW_COMMON + bool + default n +# default BLK_DEV_COW || BLK_DEV_UBD + config BLK_DEV_LOOP tristate "Loopback device support" diff -Nru a/arch/um/Kconfig_block~uml-kill-cow.diff b/arch/um/Kconfig_block~uml-kill-cow.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/Kconfig_block~uml-kill-cow.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,82 @@ + +menu "Block Devices" + +config BLK_DEV_UBD + bool "Virtual block device" + help + The User-Mode Linux port includes a driver called UBD which will let + you access arbitrary files on the host computer as block devices. + Unless you know that you do not need such virtual block devices say + Y here. + +config BLK_DEV_UBD_SYNC + bool "Always do synchronous disk IO for UBD" + depends on BLK_DEV_UBD + help + Writes to the virtual block device are not immediately written to the + host's disk; this may cause problems if, for example, the + User-Mode Linux 'Virtual Machine' uses a journalling filesystem and + the host computer crashes. + + Synchronous operation (i.e. always writing data to the host's disk + immediately) is configurable on a per-UBD basis by using a special + kernel command line option.
Alternatively, you can say Y here to + turn on synchronous operation by default for all block devices. + + If you're running a journalling file system (like reiserfs, for + example) in your virtual machine, you will want to say Y here. If + you care for the safety of the data in your virtual machine, Y is a + wise choice too. In all other cases (for example, if you're just + playing around with User-Mode Linux) you can choose N. + +# Turn this back on when the driver actually works +# +#config BLK_DEV_COW +# tristate "COW block device" +# help +# This is a layered driver which sits above two other block devices. +# One is read-only, and the other is a read-write layer which stores +# all changes. This provides the illusion that the read-only layer +# can be mounted read-write and changed. + +config BLK_DEV_COW_COMMON + bool + default BLK_DEV_COW || BLK_DEV_UBD + +config BLK_DEV_LOOP + tristate "Loopback device support" + +config BLK_DEV_NBD + tristate "Network block device support" + depends on NET + +config BLK_DEV_RAM + tristate "RAM disk support" + +config BLK_DEV_RAM_SIZE + int "Default RAM disk size" + depends on BLK_DEV_RAM + default "4096" + +config BLK_DEV_INITRD + bool "Initial RAM disk (initrd) support" + depends on BLK_DEV_RAM=y + +config MMAPPER + tristate "Example IO memory driver" + help + The User-Mode Linux port can provide support for IO Memory + emulation with this option. This allows a host file to be + specified as an I/O region on the kernel command line. That file + will be mapped into UML's kernel address space where a driver can + locate it and do whatever it wants with the memory, including + providing an interface to it for UML processes to use. + + For more information, see + . + + If you'd like to be able to provide a simulated IO port space for + User-Mode Linux processes, say Y. If unsure, say N. + +endmenu + diff -Nru a/arch/um/Kconfig_block~uml-summa.diff b/arch/um/Kconfig_block~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/Kconfig_block~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,68 @@ + +menu "Block Devices" + +config BLK_DEV_UBD + bool "Virtual block device" + help + The User-Mode Linux port includes a driver called UBD which will let + you access arbitrary files on the host computer as block devices. + Unless you know that you do not need such virtual block devices say + Y here. + +config BLK_DEV_UBD_SYNC + bool "Always do synchronous disk IO for UBD" + depends on BLK_DEV_UBD + help + Writes to the virtual block device are not immediately written to the + host's disk; this may cause problems if, for example, the + User-Mode Linux 'Virtual Machine' uses a journalling filesystem and + the host computer crashes. + + Synchronous operation (i.e. always writing data to the host's disk + immediately) is configurable on a per-UBD basis by using a special + kernel command line option. Alternatively, you can say Y here to + turn on synchronous operation by default for all block devices. + + If you're running a journalling file system (like reiserfs, for + example) in your virtual machine, you will want to say Y here. If + you care for the safety of the data in your virtual machine, Y is a + wise choice too. In all other cases (for example, if you're just + playing around with User-Mode Linux) you can choose N. 
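The synchronous mode this help text describes means each write reaches the host's disk before the virtual write completes. One plausible user-space illustration of the idea, assuming synchronous I/O is obtained by opening the backing file with O_SYNC; this is a sketch, not the ubd driver's actual code:

#include <fcntl.h>
#include <unistd.h>

/* Open a UBD-style backing file; with 'sync' set, every write() is
 * committed to the host's disk before it returns, which is the
 * performance trade-off the option above describes. */
int open_backing_file(const char *path, int sync)
{
	int flags = O_RDWR;

	if (sync)
		flags |= O_SYNC;
	return open(path, flags);
}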
+ +config BLK_DEV_LOOP + tristate "Loopback device support" + +config BLK_DEV_NBD + tristate "Network block device support" + depends on NET + +config BLK_DEV_RAM + tristate "RAM disk support" + +config BLK_DEV_RAM_SIZE + int "Default RAM disk size" + depends on BLK_DEV_RAM + default "4096" + +config BLK_DEV_INITRD + bool "Initial RAM disk (initrd) support" + depends on BLK_DEV_RAM=y + +config MMAPPER + tristate "Example IO memory driver" + help + The User-Mode Linux port can provide support for IO Memory + emulation with this option. This allows a host file to be + specified as an I/O region on the kernel command line. That file + will be mapped into UML's kernel address space where a driver can + locate it and do whatever it wants with the memory, including + providing an interface to it for UML processes to use. + + For more information, see + . + + If you'd like to be able to provide a simulated IO port space for + User-Mode Linux processes, say Y. If unsure, say N. + +endmenu + diff -Nru a/arch/um/Kconfig_net b/arch/um/Kconfig_net --- a/arch/um/Kconfig_net Fri Oct 31 14:10:54 2003 +++ b/arch/um/Kconfig_net Fri Oct 31 14:10:54 2003 @@ -1,5 +1,5 @@ -menu "Network Devices" +menu "UML Network Devices" depends on NET # UML virtual driver @@ -175,74 +175,6 @@ don't need UML networking, say N. Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp" - - -# Below are hardware-independent drivers mirrored from -# drivers/net/Config.in. It would be nice if Linux -# had HW independent drivers separated from the other -# but it does not. Until then each non-ISA/PCI arch -# needs to provide it's own menu of network drivers -config DUMMY - tristate "Dummy net driver support" - -config BONDING - tristate "Bonding driver support" - -config EQUALIZER - tristate "EQL (serial line load balancing) support" - -config TUN - tristate "Universal TUN/TAP device driver support" - -config ETHERTAP - tristate "Ethertap network tap (OBSOLETE)" - depends on EXPERIMENTAL && NETLINK - -config PPP - tristate "PPP (point-to-point protocol) support" - -config PPP_MULTILINK - bool "PPP multilink support (EXPERIMENTAL)" - depends on PPP && EXPERIMENTAL - -config PPP_FILTER - bool "PPP filtering" - depends on PPP && FILTER - -config PPP_ASYNC - tristate "PPP support for async serial ports" - depends on PPP - -config PPP_SYNC_TTY - tristate "PPP support for sync tty ports" - depends on PPP - -config PPP_DEFLATE - tristate "PPP Deflate compression" - depends on PPP - -config PPP_BSDCOMP - tristate "PPP BSD-Compress compression" - depends on PPP - -config PPPOE - tristate "PPP over Ethernet (EXPERIMENTAL)" - depends on PPP && EXPERIMENTAL - -config SLIP - tristate "SLIP (serial line) support" - -config SLIP_COMPRESSED - bool "CSLIP compressed headers" - depends on SLIP=y - -config SLIP_SMART - bool "Keepalive and linefill" - depends on SLIP=y - -config SLIP_MODE_SLIP6 - bool "Six bit SLIP encapsulation" - depends on SLIP=y endmenu diff -Nru a/arch/um/Kconfig_net~uml-summa.diff b/arch/um/Kconfig_net~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/Kconfig_net~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,248 @@ + +menu "Network Devices" + depends on NET + +# UML virtual driver +config UML_NET + bool "Virtual network device" + help + While the User-Mode port cannot directly talk to any physical + hardware devices, this choice and the following transport options + provide one or more virtual network devices through which the UML + kernels can talk to each other, the host, and with the host's 
help, + machines on the outside world. + + For more information, including explanations of the networking and + sample configurations, see + . + + If you'd like to be able to enable networking in the User-Mode + linux environment, say Y; otherwise say N. Note that you must + enable at least one of the following transport options to actually + make use of UML networking. + +config UML_NET_ETHERTAP + bool "Ethertap transport" + depends on UML_NET + help + The Ethertap User-Mode Linux network transport allows a single + running UML to exchange packets with its host over one of the + host's Ethertap devices, such as /dev/tap0. Additional running + UMLs can use additional Ethertap devices, one per running UML. + While the UML believes it's on a (multi-device, broadcast) virtual + Ethernet network, it's in fact communicating over a point-to-point + link with the host. + + To use this, your host kernel must have support for Ethertap + devices. Also, if your host kernel is 2.4.x, it must have + CONFIG_NETLINK_DEV configured as Y or M. + + For more information, see + That site + has examples of the UML command line to use to enable Ethertap + networking. + + If you'd like to set up an IP network with the host and/or the + outside world, say Y to this, the Daemon Transport and/or the + Slip Transport. You'll need at least one of them, but may choose + more than one without conflict. If you don't need UML networking, + say N. + +config UML_NET_TUNTAP + bool "TUN/TAP transport" + depends on UML_NET + help + The UML TUN/TAP network transport allows a UML instance to exchange + packets with the host over a TUN/TAP device. This option will only + work with a 2.4 host, unless you've applied the TUN/TAP patch to + your 2.2 host kernel. + + To use this transport, your host kernel must have support for TUN/TAP + devices, either built-in or as a module. + +config UML_NET_SLIP + bool "SLIP transport" + depends on UML_NET + help + The slip User-Mode Linux network transport allows a running UML to + network with its host over a point-to-point link. Unlike Ethertap, + which can carry any Ethernet frame (and hence even non-IP packets), + the slip transport can only carry IP packets. + + To use this, your host must support slip devices. + + For more information, see + . That site + has examples of the UML command line to use to enable slip + networking, and details of a few quirks with it. + + The Ethertap Transport is preferred over slip because of its + limitations. If you prefer slip, however, say Y here. Otherwise + choose the Multicast transport (to network multiple UMLs on + multiple hosts), Ethertap (to network with the host and the + outside world), and/or the Daemon transport (to network multiple + UMLs on a single host). You may choose more than one without + conflict. If you don't need UML networking, say N. + +config UML_NET_DAEMON + bool "Daemon transport" + depends on UML_NET + help + This User-Mode Linux network transport allows one or more running + UMLs on a single host to communicate with each other, but not to + the host. + + To use this form of networking, you'll need to run the UML + networking daemon on the host. + + For more information, see + That site + has examples of the UML command line to use to enable Daemon + networking. + + If you'd like to set up a network with other UMLs on a single host, + say Y. If you need a network between UMLs on multiple physical + hosts, choose the Multicast Transport. 
To set up a network with + the host and/or other IP machines, say Y to the Ethertap or Slip + transports. You'll need at least one of them, but may choose + more than one without conflict. If you don't need UML networking, + say N. + +config UML_NET_MCAST + bool "Multicast transport" + depends on UML_NET + help + This Multicast User-Mode Linux network transport allows multiple + UMLs (even ones running on different host machines!) to talk to + each other over a virtual ethernet network. However, it requires + at least one UML with one of the other transports to act as a + bridge if any of them need to be able to talk to their hosts or any + other IP machines. + + To use this, your host kernel(s) must support IP Multicasting. + + For more information, see + That site + has examples of the UML command line to use to enable Multicast + networking, and notes about the security of this approach. + + If you need UMLs on multiple physical hosts to communicate as if + they shared an Ethernet network, say Y. If you need to communicate + with other IP machines, make sure you select one of the other + transports (possibly in addition to Multicast; they're not + exclusive). If you don't need to network UMLs say N to each of + the transports. + +config UML_NET_PCAP + bool "pcap transport" + depends on UML_NET + help + The pcap transport makes a pcap packet stream on the host look + like an ethernet device inside UML. This is useful for making + UML act as a network monitor for the host. You must have libcap + installed in order to build the pcap transport into UML. + + For more information, see + That site + has examples of the UML command line to use to enable this option. + + If you intend to use UML as a network monitor for the host, say + Y here. Otherwise, say N. + +config UML_NET_SLIRP + bool "SLiRP transport" + depends on UML_NET + help + The SLiRP User-Mode Linux network transport allows a running UML + to network by invoking a program that can handle SLIP encapsulated + packets. This is commonly (but not limited to) the application + known as SLiRP, a program that can re-socket IP packets back onto + the host on which it is run. Only IP packets are supported, + unlike other network transports that can handle all Ethernet + frames. In general, slirp allows the UML the same IP connectivity + to the outside world that the host user is permitted, and unlike + other transports, SLiRP works without the need of root level + privleges, setuid binaries, or SLIP devices on the host. This + also means not every type of connection is possible, but most + situations can be accomodated with carefully crafted slirp + commands that can be passed along as part of the network device's + setup string. The effect of this transport on the UML is similar + that of a host behind a firewall that masquerades all network + connections passing through it (but is less secure). + + To use this you should first have slirp compiled somewhere + accessible on the host, and have read its documentation. If you + don't need UML networking, say N. + + Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp" + + +# Below are hardware-independent drivers mirrored from +# drivers/net/Config.in. It would be nice if Linux +# had HW independent drivers separated from the other +# but it does not. 
Until then each non-ISA/PCI arch +# needs to provide it's own menu of network drivers +config DUMMY + tristate "Dummy net driver support" + +config BONDING + tristate "Bonding driver support" + +config EQUALIZER + tristate "EQL (serial line load balancing) support" + +config TUN + tristate "Universal TUN/TAP device driver support" + +config ETHERTAP + tristate "Ethertap network tap (OBSOLETE)" + depends on EXPERIMENTAL && NETLINK + +config PPP + tristate "PPP (point-to-point protocol) support" + +config PPP_MULTILINK + bool "PPP multilink support (EXPERIMENTAL)" + depends on PPP && EXPERIMENTAL + +config PPP_FILTER + bool "PPP filtering" + depends on PPP && FILTER + +config PPP_ASYNC + tristate "PPP support for async serial ports" + depends on PPP + +config PPP_SYNC_TTY + tristate "PPP support for sync tty ports" + depends on PPP + +config PPP_DEFLATE + tristate "PPP Deflate compression" + depends on PPP + +config PPP_BSDCOMP + tristate "PPP BSD-Compress compression" + depends on PPP + +config PPPOE + tristate "PPP over Ethernet (EXPERIMENTAL)" + depends on PPP && EXPERIMENTAL + +config SLIP + tristate "SLIP (serial line) support" + +config SLIP_COMPRESSED + bool "CSLIP compressed headers" + depends on SLIP=y + +config SLIP_SMART + bool "Keepalive and linefill" + depends on SLIP=y + +config SLIP_MODE_SLIP6 + bool "Six bit SLIP encapsulation" + depends on SLIP=y + +endmenu + diff -Nru a/arch/um/Kconfig~uml-summa.diff b/arch/um/Kconfig~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/Kconfig~uml-summa.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,270 @@ +config USERMODE + bool + default y + +# XXX: does UM have a mmu/swap? +config MMU + bool + default y + +mainmenu "Linux/Usermode Kernel Configuration" + +config ISA + bool + +config SBUS + bool + +config PCI + bool + +config UID16 + bool + default y + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +menu "UML-specific options" + +config MODE_TT + bool "Tracing thread support" + default y + help + This option controls whether tracing thread support is compiled + into UML. Normally, this should be set to Y. If you intend to + use only skas mode (and the host has the skas patch applied to it), + then it is OK to say N here. + +config STATIC_LINK + bool "Force a static link" + default n + depends on !MODE_TT + help + If CONFIG_MODE_TT is disabled, then this option gives you the ability + to force a static link of UML. Normally, if only skas mode is built + in to UML, it will be linked as a shared binary. This is inconvenient + for use in a chroot jail. So, if you intend to run UML inside a + chroot, and you disable CONFIG_MODE_TT, you probably want to say Y + here. + +config MODE_SKAS + bool "Separate Kernel Address Space support" + default y + help + This option controls whether skas (separate kernel address space) + support is compiled in. If you have applied the skas patch to the + host, then you certainly want to say Y here (and consider saying N + to CONFIG_MODE_TT). Otherwise, it is safe to say Y. Disabling this + option will shrink the UML binary slightly. + +config NET + bool "Networking support" + +source "fs/Kconfig.binfmt" + +config HOSTFS + tristate "Host filesystem" + help + While the User-Mode Linux port uses its own root file system for + booting and normal file access, this module lets the UML user + access files stored on the host. It does not require any + network connection between the Host and UML. 
An example use of + this might be: + + mount none /tmp/fromhost -t hostfs -o /tmp/umlshare + + where /tmp/fromhost is an empty directory inside UML and + /tmp/umlshare is a directory on the host with files the UML user + wishes to access. + + For more information, see + . + + If you'd like to be able to work with files stored on the host, + say Y or M here; otherwise say N. + + +config MCONSOLE + bool "Management console" + help + The user mode linux management console is a low-level interface to + the kernel, somewhat like the i386 SysRq interface. Since there is + a full-blown operating system running under every user mode linux + instance, there is much greater flexibility possible than with the + SysRq mechanism. + + If you answer 'Y' to this option, to use this feature, you need the + mconsole client (called uml_mconsole) which is present in CVS in + 2.4.5-9um and later (path /tools/mconsole), and is also in the + distribution RPM package in 2.4.6 and later. + + It is safe to say 'Y' here. + +config MAGIC_SYSRQ + bool "Magic SysRq key" + depends on MCONSOLE + +config HOST_2G_2G + bool "2G/2G host address space split" + +config UML_SMP + bool "Symmetric multi-processing support" + help + This option enables UML SMP support. UML implements virtual SMP by + allowing as many processes to run simultaneously on the host as + there are virtual processors configured. Obviously, if the host is + a uniprocessor, those processes will timeshare, but, inside UML, + will appear to be running simultaneously. If the host is a + multiprocessor, then UML processes may run simultaneously, depending + on the host scheduler. + CONFIG_SMP will be set to whatever this option is set to. + It is safe to leave this unchanged. + +config SMP + bool + default UML_SMP + +config NR_CPUS + int "Maximum number of CPUs (2-32)" + depends on SMP + default "32" + +config NEST_LEVEL + int "Nesting level" + default "0" + help + This is set to the number of layers of UMLs that this UML will be run + in. Normally, this is zero, meaning that it will run directly on the + host. Setting it to one will build a UML that can run inside a UML + that is running on the host. Generally, if you intend this UML to run + inside another UML, set CONFIG_NEST_LEVEL to one more than the host + UML. + + Note that if the hosting UML has its CONFIG_KERNEL_HALF_GIGS set to + greater than one, then the guest UML should have its CONFIG_NEST_LEVEL + set to the host's CONFIG_NEST_LEVEL + CONFIG_KERNEL_HALF_GIGS. + Only change this if you are running nested UMLs. + +config KERNEL_HALF_GIGS + int "Kernel address space size (in .5G units)" + default "1" + help + This determines the amount of address space that UML will allocate for + its own, measured in half Gigabyte units. The default is 1. + Change this only if you need to boot UML with an unusually large amount + of physical memory. + +config HIGHMEM + bool "Highmem support" + +config KERNEL_STACK_ORDER + int "Kernel stack size order" + default 2 + help + This option determines the size of UML kernel stacks. They will + be 1 << order pages. The default is OK unless you're running Valgrind + on UML, in which case, set this to 3. 
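The "1 << order pages" rule quoted above is easy to check numerically; the stand-alone snippet below prints the resulting stack sizes, assuming a 4 KiB i386 page size:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;	/* assumed i386 page size */
	int order;

	/* A kernel stack occupies (1 << order) pages, so the default
	 * order 2 gives 16 KiB and order 3 (suggested above for
	 * Valgrind) gives 32 KiB. */
	for (order = 0; order <= 3; order++)
		printf("order %d -> %lu KiB\n", order,
		       (page_size << order) / 1024);
	return 0;
}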
+ +endmenu + +source "init/Kconfig" + +source "drivers/base/Kconfig" + +source "arch/um/Kconfig_char" + +source "arch/um/Kconfig_block" + +config NETDEVICES + bool + default NET + +source "arch/um/Kconfig_net" + +source "net/Kconfig" + +source "fs/Kconfig" + +source "security/Kconfig" + +source "crypto/Kconfig" + +source "lib/Kconfig" + +menu "SCSI support" + +config SCSI + tristate "SCSI support" + +# This gives us free_dma, which scsi.c wants. +config GENERIC_ISA_DMA + bool + depends on SCSI + default y + +source "arch/um/Kconfig_scsi" + +endmenu + +source "drivers/md/Kconfig" + +source "drivers/mtd/Kconfig" + + +menu "Kernel hacking" + +config DEBUG_SLAB + bool "Debug memory allocations" + +config DEBUG_SPINLOCK + bool "Debug spinlocks usage" + +config DEBUG_INFO + bool "Enable kernel debugging symbols" + help + When this is enabled, the User-Mode Linux binary will include + debugging symbols. This enlarges the binary by a few megabytes, + but aids in tracking down kernel problems in UML. It is required + if you intend to do any kernel development. + + If you're truly short on disk space or don't expect to report any + bugs back to the UML developers, say N, otherwise say Y. + +config FRAME_POINTER + bool + default y if DEBUG_INFO + +config PT_PROXY + bool "Enable ptrace proxy" + depends on XTERM_CHAN && DEBUG_INFO + +config GPROF + bool "Enable gprof support" + depends on DEBUG_INFO + help + This allows profiling of a User-Mode Linux kernel with the gprof + utility. + + See for more + details. + + If you're involved in UML kernel development and want to use gprof, + say Y. If you're unsure, say N. + +config GCOV + bool "Enable gcov support" + depends on DEBUG_INFO + help + This option allows developers to retrieve coverage data from a UML + session. + + See for more + details. + + If you're involved in UML kernel development and want to use gcov, + say Y. If you're unsure, say N. + +endmenu + diff -Nru a/arch/um/Makefile b/arch/um/Makefile --- a/arch/um/Makefile Fri Oct 31 14:10:54 2003 +++ b/arch/um/Makefile Fri Oct 31 14:10:54 2003 @@ -24,15 +24,17 @@ # Have to precede the include because the included Makefiles reference them. 
SYMLINK_HEADERS = include/asm-um/archparam.h include/asm-um/system.h \ include/asm-um/sigcontext.h include/asm-um/processor.h \ - include/asm-um/ptrace.h include/asm-um/arch-signal.h + include/asm-um/ptrace.h include/asm-um/arch-signal.h \ + include/asm-um/module.h ARCH_SYMLINKS = include/asm-um/arch $(ARCH_DIR)/include/sysdep $(ARCH_DIR)/os \ $(SYMLINK_HEADERS) $(ARCH_DIR)/include/uml-config.h GEN_HEADERS += $(ARCH_DIR)/include/task.h $(ARCH_DIR)/include/kern_constants.h -include $(ARCH_DIR)/Makefile-$(SUBARCH) -include $(ARCH_DIR)/Makefile-os-$(OS) +.PHONY: sys_prepare +sys_prepare: + @: MAKEFILE-$(CONFIG_MODE_TT) += Makefile-tt MAKEFILE-$(CONFIG_MODE_SKAS) += Makefile-skas @@ -41,6 +43,9 @@ include $(addprefix $(ARCH_DIR)/,$(MAKEFILE-y)) endif +include $(ARCH_DIR)/Makefile-$(SUBARCH) +include $(ARCH_DIR)/Makefile-os-$(OS) + EXTRAVERSION := $(EXTRAVERSION)-1um ARCH_INCLUDE = -I$(ARCH_DIR)/include @@ -52,14 +57,16 @@ CFLAGS += $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \ -D_LARGEFILE64_SOURCE $(ARCH_INCLUDE) -Derrno=kernel_errno \ - $(MODE_INCLUDE) + -Dsigprocmask=kernel_sigprocmask $(MODE_INCLUDE) LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc +CONFIG_NEST_LEVEL ?= 0 +CONFIG_KERNEL_HALF_GIGS ?= 1 SIZE = (($(CONFIG_NEST_LEVEL) + $(CONFIG_KERNEL_HALF_GIGS)) * 0x20000000) ifeq ($(CONFIG_MODE_SKAS), y) -$(SYS_HEADERS) : $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h +$(SYS_HEADERS) : $(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h endif include/linux/version.h: arch/$(ARCH)/Makefile @@ -98,17 +105,17 @@ CONFIG_KERNEL_STACK_ORDER ?= 2 STACK_SIZE := $(shell echo $$[ 4096 * (1 << $(CONFIG_KERNEL_STACK_ORDER)) ] ) -AFLAGS_vmlinux.lds.o = -U$(SUBARCH) \ +AFLAGS_vmlinux.lds.o = $(shell echo -U$(SUBARCH) \ -DSTART=$$(($(TOP_ADDR) - $(SIZE))) -DELF_ARCH=$(ELF_ARCH) \ -DELF_FORMAT=\"$(ELF_FORMAT)\" $(CPP_MODE_TT) \ - -DKERNEL_STACK_SIZE=$(STACK_SIZE) + -DKERNEL_STACK_SIZE=$(STACK_SIZE)) -AFLAGS_$(LD_SCRIPT-y:.s=).o = $(AFLAGS_vmlinux.lds.o) -P -C -Uum +export AFLAGS_$(LD_SCRIPT-y:.s=).o = $(AFLAGS_vmlinux.lds.o) -P -C -Uum LD_SCRIPT-y := $(ARCH_DIR)/$(LD_SCRIPT-y) -$(LD_SCRIPT-y) : $(LD_SCRIPT-y:.s=.S) scripts FORCE - $(call if_changed_dep,as_s_S) +#$(LD_SCRIPT-y) : $(LD_SCRIPT-y:.s=.S) scripts FORCE +# $(call if_changed_dep,as_s_S) linux: vmlinux $(LD_SCRIPT-y) $(CC) -Wl,-T,$(LD_SCRIPT-y) $(LINK-y) $(LINK_WRAPS) \ @@ -116,6 +123,7 @@ USER_CFLAGS := $(patsubst -I%,,$(CFLAGS)) USER_CFLAGS := $(patsubst -Derrno=kernel_errno,,$(USER_CFLAGS)) +USER_CFLAGS := $(patsubst -Dsigprocmask=kernel_sigprocmask,,$(USER_CFLAGS)) USER_CFLAGS := $(patsubst -D__KERNEL__,,$(USER_CFLAGS)) $(ARCH_INCLUDE) \ $(MODE_INCLUDE) @@ -123,9 +131,10 @@ USER_CFLAGS += -D_GNU_SOURCE CLEAN_FILES += linux x.i gmon.out $(ARCH_DIR)/uml.lds.s \ - $(ARCH_DIR)/dyn_link.ld.s $(GEN_HEADERS) + $(ARCH_DIR)/dyn_link.ld.s $(ARCH_DIR)/include/uml-config.h \ + $(GEN_HEADERS) -$(ARCH_DIR)/main.o: $(ARCH_DIR)/main.c +$(ARCH_DIR)/main.o: $(ARCH_DIR)/main.c sys_prepare $(CC) $(USER_CFLAGS) $(EXTRA_CFLAGS) -c -o $@ $< archmrproper: @@ -161,19 +170,23 @@ $(ARCH_DIR)/os: cd $(ARCH_DIR) && ln -sf os-$(OS) os -$(ARCH_DIR)/include/uml-config.h : +$(ARCH_DIR)/include/uml-config.h : $(TOPDIR)/include/linux/autoconf.h sed 's/ CONFIG/ UML_CONFIG/' $(TOPDIR)/include/linux/autoconf.h > $@ +filechk_$(ARCH_DIR)/include/task.h := $(ARCH_DIR)/util/mk_task + $(ARCH_DIR)/include/task.h : $(ARCH_DIR)/util/mk_task - $< > $@ + $(call filechk,$@) + +filechk_$(ARCH_DIR)/include/kern_constants.h := $(ARCH_DIR)/util/mk_constants 
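The filechk_* assignments above, together with the rules that follow, use kbuild's filechk idiom: run a generator, but rewrite the target header only when its content actually changed, so objects depending on it are not rebuilt needlessly. A rough stand-alone C analogue of that check; slurp and filechk are invented names, not kbuild code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Read a whole file into memory; returns NULL if it does not exist. */
static char *slurp(const char *path, size_t *len)
{
	FILE *f = fopen(path, "rb");
	char *buf;
	long sz;

	if (!f)
		return NULL;
	fseek(f, 0, SEEK_END);
	sz = ftell(f);
	rewind(f);
	buf = malloc(sz + 1);
	if (!buf) {
		fclose(f);
		return NULL;
	}
	*len = fread(buf, 1, sz, f);
	buf[*len] = '\0';
	fclose(f);
	return buf;
}

/* Write 'fresh' to 'path' only if it differs from what is there. */
static void filechk(const char *path, const char *fresh)
{
	size_t oldlen = 0;
	char *old = slurp(path, &oldlen);

	if (!old || strlen(fresh) != oldlen || memcmp(old, fresh, oldlen)) {
		FILE *f = fopen(path, "wb");

		if (f) {
			fputs(fresh, f);
			fclose(f);
		}
	}
	free(old);
}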
$(ARCH_DIR)/util/mk_constants + $(ARCH_DIR)/include/kern_constants.h : $(ARCH_DIR)/util/mk_constants - $< > $@ + $(call filechk,$@) -$(ARCH_DIR)/util/mk_task : $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h \ - $(ARCH_DIR)/util FORCE ; +$(ARCH_DIR)/util/mk_task $(ARCH_DIR)/util/mk_constants : $(ARCH_DIR)/util \ + sys_prepare FORCE ; $(ARCH_DIR)/util: FORCE - @$(call descend,$@,) + $(MAKE) -f scripts/Makefile.build obj=$@ -export SUBARCH USER_CFLAGS OS +export SUBARCH USER_CFLAGS OS diff -Nru a/arch/um/Makefile-i386 b/arch/um/Makefile-i386 --- a/arch/um/Makefile-i386 Fri Oct 31 14:10:54 2003 +++ b/arch/um/Makefile-i386 Fri Oct 31 14:10:54 2003 @@ -16,22 +16,28 @@ SYS_HEADERS = $(SYS_DIR)/sc.h $(SYS_DIR)/thread.h +sys_prepare: $(SYS_DIR)/sc.h + prepare: $(SYS_HEADERS) +filechk_$(SYS_DIR)/sc.h := $(SYS_UTIL_DIR)/mk_sc + $(SYS_DIR)/sc.h: $(SYS_UTIL_DIR)/mk_sc - $< > $@ + $(call filechk,$@) + +filechk_$(SYS_DIR)/thread.h := $(SYS_UTIL_DIR)/mk_thread $(SYS_DIR)/thread.h: $(SYS_UTIL_DIR)/mk_thread - $< > $@ + $(call filechk,$@) -$(SYS_UTIL_DIR)/mk_sc: FORCE ; - @$(call descend,$(SYS_UTIL_DIR),$@) +$(SYS_UTIL_DIR)/mk_sc: scripts/fixdep include/config/MARKER FORCE ; + +@$(call descend,$(SYS_UTIL_DIR),$@) -$(SYS_UTIL_DIR)/mk_thread: $(ARCH_SYMLINKS) $(GEN_HEADERS) FORCE ; - @$(call descend,$(SYS_UTIL_DIR),$@) +$(SYS_UTIL_DIR)/mk_thread: $(ARCH_SYMLINKS) $(GEN_HEADERS) sys_prepare FORCE ; + +@$(call descend,$(SYS_UTIL_DIR),$@) $(SYS_UTIL_DIR): include/asm FORCE - @$(call descend,$@,) + +@$(call descend,$@,) sysclean : rm -f $(SYS_HEADERS) diff -Nru a/arch/um/Makefile-skas b/arch/um/Makefile-skas --- a/arch/um/Makefile-skas Fri Oct 31 14:10:53 2003 +++ b/arch/um/Makefile-skas Fri Oct 31 14:10:53 2003 @@ -14,7 +14,7 @@ LINK_SKAS = -Wl,-rpath,/lib LD_SCRIPT_SKAS = dyn.lds.s -GEN_HEADERS += $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h +GEN_HEADERS += $(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h -$(ARCH_DIR)/kernel/skas/include/skas_ptregs.h : - $(MAKE) -C $(ARCH_DIR)/kernel/skas include/skas_ptregs.h +$(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h : + $(call descend,$(ARCH_DIR)/kernel/skas,$@)
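The filechk rules introduced above (for task.h, kern_constants.h, sc.h and thread.h) rely on kbuild's filechk helper: run a generator program, capture its output in a temporary file, and replace the target header only when the generated content actually differs, so objects that include the header are not rebuilt on every make invocation. A minimal standalone sketch of the idiom, not kbuild's exact definition; mk_demo and demo.h are hypothetical names:

define filechk
	@set -e; \
	$(filechk_$(1)) > $@.tmp; \
	if [ -r $@ ] && cmp -s $@ $@.tmp; then \
		rm -f $@.tmp; \
	else \
		echo '  UPD     $@'; \
		mv -f $@.tmp $@; \
	fi
endef

# The per-target variable names the generator to run for that target.
filechk_demo.h := ./mk_demo

demo.h: mk_demo
	$(call filechk,$@)

On the first run demo.h does not exist, so the generated file is moved into place; on later runs it is replaced only if cmp reports a difference.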
diff -Nru a/arch/um/config.release b/arch/um/config.release --- a/arch/um/config.release Fri Oct 31 14:10:54 2003 +++ b/arch/um/config.release Fri Oct 31 14:10:54 2003 @@ -228,7 +228,6 @@ CONFIG_EXT2_FS=y CONFIG_SYSV_FS=m CONFIG_UDF_FS=m -# CONFIG_UDF_RW is not set CONFIG_UFS_FS=m # CONFIG_UFS_FS_WRITE is not set
diff -Nru a/arch/um/defconfig b/arch/um/defconfig --- a/arch/um/defconfig Fri Oct 31 14:10:54 2003 +++ b/arch/um/defconfig Fri Oct 31 14:10:54 2003 @@ -3,29 +3,19 @@ # CONFIG_USERMODE=y CONFIG_MMU=y -CONFIG_SWAP=y CONFIG_UID16=y CONFIG_RWSEM_GENERIC_SPINLOCK=y -CONFIG_CONFIG_LOG_BUF_SHIFT=14 - -# -# Code maturity level options -# -CONFIG_EXPERIMENTAL=y # -# General Setup +# UML-specific options # CONFIG_MODE_TT=y CONFIG_MODE_SKAS=y CONFIG_NET=y -CONFIG_SYSVIPC=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_SYSCTL=y -CONFIG_BINFMT_AOUT=y CONFIG_BINFMT_ELF=y CONFIG_BINFMT_MISC=y CONFIG_HOSTFS=y +CONFIG_HPPFS=y CONFIG_MCONSOLE=y CONFIG_MAGIC_SYSRQ=y # CONFIG_HOST_2G_2G is not set @@ -38,10 +28,38 @@ CONFIG_KERNEL_STACK_ORDER=2 # +# Code maturity level options +# +CONFIG_EXPERIMENTAL=y +CONFIG_CLEAN_COMPILE=y +CONFIG_STANDALONE=y +CONFIG_BROKEN_ON_SMP=y + +# +# General setup +# +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_SYSCTL=y +CONFIG_LOG_BUF_SHIFT=14 +# CONFIG_IKCONFIG is not set +# CONFIG_EMBEDDED is not set +CONFIG_KALLSYMS=y +CONFIG_FUTEX=y +CONFIG_EPOLL=y +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y + +# # Loadable module support # -CONFIG_MODULES=y -# CONFIG_KMOD is not set +# CONFIG_MODULES is not set + +# +# Generic Driver Options +# # # Character Devices @@ -69,6 +87,7 @@ # CONFIG_BLK_DEV_UBD=y # CONFIG_BLK_DEV_UBD_SYNC is not set +CONFIG_BLK_DEV_COW_COMMON=n CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_NBD=y CONFIG_BLK_DEV_RAM=y @@ -78,7 +97,7 @@ CONFIG_NETDEVICES=y # -# Network Devices +# UML Network Devices # CONFIG_UML_NET=y CONFIG_UML_NET_ETHERTAP=y @@ -88,22 +107,6 @@ CONFIG_UML_NET_MCAST=y # CONFIG_UML_NET_PCAP is not set CONFIG_UML_NET_SLIRP=y -CONFIG_DUMMY=y
-# CONFIG_BONDING is not set -# CONFIG_EQUALIZER is not set -CONFIG_TUN=y -# CONFIG_ETHERTAP is not set -CONFIG_PPP=y -# CONFIG_PPP_MULTILINK is not set -# CONFIG_PPP_ASYNC is not set -# CONFIG_PPP_SYNC_TTY is not set -# CONFIG_PPP_DEFLATE is not set -# CONFIG_PPP_BSDCOMP is not set -# CONFIG_PPPOE is not set -CONFIG_SLIP=y -# CONFIG_SLIP_COMPRESSED is not set -# CONFIG_SLIP_SMART is not set -# CONFIG_SLIP_MODE_SLIP6 is not set # # Networking support @@ -115,8 +118,6 @@ CONFIG_PACKET=y CONFIG_PACKET_MMAP=y # CONFIG_NETLINK_DEV is not set -# CONFIG_NETFILTER is not set -# CONFIG_FILTER is not set CONFIG_UNIX=y # CONFIG_NET_KEY is not set CONFIG_INET=y @@ -130,8 +131,11 @@ # CONFIG_SYN_COOKIES is not set # CONFIG_INET_AH is not set # CONFIG_INET_ESP is not set -# CONFIG_XFRM_USER is not set +# CONFIG_INET_IPCOMP is not set # CONFIG_IPV6 is not set +# CONFIG_DECNET is not set +# CONFIG_BRIDGE is not set +# CONFIG_NETFILTER is not set # # SCTP Configuration (EXPERIMENTAL) @@ -141,8 +145,6 @@ # CONFIG_ATM is not set # CONFIG_VLAN_8021Q is not set # CONFIG_LLC is not set -# CONFIG_DECNET is not set -# CONFIG_BRIDGE is not set # CONFIG_X25 is not set # CONFIG_LAPB is not set # CONFIG_NET_DIVERT is not set @@ -160,6 +162,10 @@ # Network testing # # CONFIG_NET_PKTGEN is not set +CONFIG_DUMMY=y +# CONFIG_BONDING is not set +# CONFIG_EQUALIZER is not set +CONFIG_TUN=y # # Ethernet (10 or 100Mbit) @@ -171,6 +177,22 @@ # # +# Ethernet (10000 Mbit) +# +CONFIG_PPP=y +# CONFIG_PPP_MULTILINK is not set +# CONFIG_PPP_FILTER is not set +# CONFIG_PPP_ASYNC is not set +# CONFIG_PPP_SYNC_TTY is not set +# CONFIG_PPP_DEFLATE is not set +# CONFIG_PPP_BSDCOMP is not set +# CONFIG_PPPOE is not set +CONFIG_SLIP=y +# CONFIG_SLIP_COMPRESSED is not set +# CONFIG_SLIP_SMART is not set +# CONFIG_SLIP_MODE_SLIP6 is not set + +# # Wireless LAN (non-hamradio) # # CONFIG_NET_RADIO is not set @@ -188,66 +210,82 @@ # # File systems # +CONFIG_EXT2_FS=y +# CONFIG_EXT2_FS_XATTR is not set +# CONFIG_EXT3_FS is not set +# CONFIG_JBD is not set +CONFIG_REISERFS_FS=y +# CONFIG_REISERFS_CHECK is not set +# CONFIG_REISERFS_PROC_INFO is not set +# CONFIG_JFS_FS is not set +# CONFIG_XFS_FS is not set +CONFIG_MINIX_FS=y +# CONFIG_ROMFS_FS is not set CONFIG_QUOTA=y # CONFIG_QFMT_V1 is not set # CONFIG_QFMT_V2 is not set CONFIG_QUOTACTL=y -CONFIG_AUTOFS_FS=m -CONFIG_AUTOFS4_FS=m -CONFIG_REISERFS_FS=m -# CONFIG_REISERFS_CHECK is not set -# CONFIG_REISERFS_PROC_INFO is not set +CONFIG_AUTOFS_FS=y +CONFIG_AUTOFS4_FS=y + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=y +# CONFIG_JOLIET is not set +# CONFIG_ZISOFS is not set +# CONFIG_UDF_FS is not set + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_DEVFS_FS=y +CONFIG_DEVFS_MOUNT=y +# CONFIG_DEVFS_DEBUG is not set +CONFIG_DEVPTS_FS=y +# CONFIG_DEVPTS_FS_XATTR is not set +# CONFIG_TMPFS is not set +CONFIG_RAMFS=y + +# +# Miscellaneous filesystems +# # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set # CONFIG_HFS_FS is not set # CONFIG_BEFS_FS is not set # CONFIG_BFS_FS is not set -# CONFIG_EXT3_FS is not set -# CONFIG_JBD is not set -CONFIG_FAT_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m # CONFIG_EFS_FS is not set CONFIG_JFFS_FS=y CONFIG_JFFS_FS_VERBOSE=0 -CONFIG_JFFS_PROC_FS=y # CONFIG_JFFS2_FS is not set # CONFIG_CRAMFS is not set -# CONFIG_TMPFS is not set -CONFIG_RAMFS=y -CONFIG_ISO9660_FS=m -# CONFIG_JOLIET is not set -# CONFIG_ZISOFS is not set -# CONFIG_JFS_FS 
is not set -CONFIG_MINIX_FS=m # CONFIG_VXFS_FS is not set -# CONFIG_NTFS_FS is not set # CONFIG_HPFS_FS is not set -CONFIG_PROC_FS=y -CONFIG_DEVFS_FS=y -CONFIG_DEVFS_MOUNT=y -# CONFIG_DEVFS_DEBUG is not set -CONFIG_DEVPTS_FS=y # CONFIG_QNX4FS_FS is not set -# CONFIG_ROMFS_FS is not set -CONFIG_EXT2_FS=y -# CONFIG_EXT2_FS_XATTR is not set # CONFIG_SYSV_FS is not set -# CONFIG_UDF_FS is not set # CONFIG_UFS_FS is not set -# CONFIG_XFS_FS is not set # # Network File Systems # -# CONFIG_CODA_FS is not set -# CONFIG_INTERMEZZO_FS is not set # CONFIG_NFS_FS is not set # CONFIG_NFSD is not set # CONFIG_EXPORTFS is not set -# CONFIG_CIFS is not set # CONFIG_SMB_FS is not set +# CONFIG_CIFS is not set # CONFIG_NCP_FS is not set +# CONFIG_CODA_FS is not set +# CONFIG_INTERMEZZO_FS is not set # CONFIG_AFS_FS is not set # @@ -317,28 +355,7 @@ # # SCSI support # -CONFIG_SCSI=y -CONFIG_GENERIC_ISA_DMA=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=y -CONFIG_SD_EXTRA_DEVS=40 -CONFIG_CHR_DEV_ST=y -CONFIG_BLK_DEV_SR=y -CONFIG_BLK_DEV_SR_VENDOR=y -CONFIG_SR_EXTRA_DEVS=2 -CONFIG_CHR_DEV_SG=y - -# -# Some SCSI devices (e.g. CD jukebox) support multiple LUNs -# -CONFIG_SCSI_DEBUG_QUEUES=y -CONFIG_SCSI_MULTI_LUN=y -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_LOGGING=y -CONFIG_SCSI_DEBUG=y +# CONFIG_SCSI is not set # # Multi-device support (RAID and LVM) @@ -360,6 +377,7 @@ CONFIG_MTD_BLOCK=y # CONFIG_FTL is not set # CONFIG_NFTL is not set +# CONFIG_INFTL is not set # # RAM/ROM/Flash chip drivers @@ -374,20 +392,21 @@ # # Mapping drivers for chip access # +# CONFIG_MTD_COMPLEX_MAPPINGS is not set # # Self-contained MTD device drivers # # CONFIG_MTD_SLRAM is not set # CONFIG_MTD_MTDRAM is not set -CONFIG_MTD_BLKMTD=m +CONFIG_MTD_BLKMTD=y # # Disk-On-Chip Device Drivers # -# CONFIG_MTD_DOC1000 is not set # CONFIG_MTD_DOC2000 is not set # CONFIG_MTD_DOC2001 is not set +# CONFIG_MTD_DOC2001PLUS is not set # # NAND Flash Device Drivers
diff -Nru a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile --- a/arch/um/drivers/Makefile Fri Oct 31 14:10:54 2003 +++ b/arch/um/drivers/Makefile Fri Oct 31 14:10:54 2003 @@ -1,5 +1,5 @@ # -# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) +# Copyright (C) 2000, 2002, 2003 Jeff Dike (jdike@karaya.com) # Licensed under the GPL # @@ -39,6 +39,8 @@ obj-$(CONFIG_TTY_CHAN) += tty.o obj-$(CONFIG_XTERM_CHAN) += xterm.o xterm_kern.o obj-$(CONFIG_UML_WATCHDOG) += harddog.o +obj-$(CONFIG_BLK_DEV_COW) += cow_kern.o +obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o obj-y += stdio_console.o $(CHAN_OBJS) @@ -46,7 +48,7 @@ USER_OBJS := $(filter %_user.o,$(obj-y) $(obj-m) $(USER_SINGLE_OBJS)) fd.o \ null.o pty.o tty.o xterm.o -USER_OBJS := $(foreach file,$(USER_OBJS),arch/um/drivers/$(file)) +USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) $(USER_OBJS) : %.o: %.c $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $<
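In the drivers Makefile above, the *_user.o objects are host-side code linked against libc, so they are compiled with USER_CFLAGS (CFLAGS with -D__KERNEL__, -Derrno=kernel_errno and -Dsigprocmask=kernel_sigprocmask stripped out) rather than with the kernel flags. Switching the prefix from a hard-coded arch/um/drivers/ to $(obj)/ matters because kbuild runs every sub-Makefile from the tree root and passes the directory being built in $(obj). A cut-down sketch of the pattern; the file names are illustrative, not the patch's exact list:

# obj is set by kbuild to the directory currently being built.
USER_OBJS := fd.o pty.o tty.o
USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file))

# Build host-side objects with userspace flags, not kernel CFLAGS.
$(USER_OBJS): %.o: %.c
	$(CC) $(USER_CFLAGS) -c -o $@ $<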
diff -Nru a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c --- a/arch/um/drivers/chan_kern.c Fri Oct 31 14:10:54 2003 +++ b/arch/um/drivers/chan_kern.c Fri Oct 31 14:10:54 2003 @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include "chan_kern.h"
+ * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c --- a/arch/um/drivers/chan_user.c Fri Oct 31 14:10:53 2003 +++ b/arch/um/drivers/chan_user.c Fri Oct 31 14:10:53 2003 @@ -188,8 +188,8 @@ if(!isatty(fd)) return; pid = tcgetpgrp(fd); - if(!CHOOSE_MODE(is_tracer_winch(pid, fd, device_data), 0) && - (pid == -1)){ + if(!CHOOSE_MODE_PROC(is_tracer_winch, is_skas_winch, pid, fd, + device_data) && (pid == -1)){ thread = winch_tramp(fd, device_data, &thread_fd); if(fd != -1){ register_winch_irq(thread_fd, fd, thread, device_data); diff -Nru a/arch/um/drivers/chan_user.c~uml-summa.diff b/arch/um/drivers/chan_user.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/drivers/chan_user.c~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,213 @@ +/* + * Copyright (C) 2000, 2001 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kern_util.h" +#include "user_util.h" +#include "chan_user.h" +#include "user.h" +#include "helper.h" +#include "os.h" +#include "choose-mode.h" +#include "mode.h" + +void generic_close(int fd, void *unused) +{ + close(fd); +} + +int generic_read(int fd, char *c_out, void *unused) +{ + int n; + + n = read(fd, c_out, sizeof(*c_out)); + if(n < 0){ + if(errno == EAGAIN) return(0); + return(-errno); + } + else if(n == 0) return(-EIO); + return(1); +} + +int generic_write(int fd, const char *buf, int n, void *unused) +{ + int count; + + count = write(fd, buf, n); + if(count < 0) return(-errno); + return(count); +} + +int generic_console_write(int fd, const char *buf, int n, void *unused) +{ + struct termios save, new; + int err; + + if(isatty(fd)){ + tcgetattr(fd, &save); + new = save; + new.c_oflag |= OPOST; + tcsetattr(fd, TCSAFLUSH, &new); + } + err = generic_write(fd, buf, n, NULL); + if(isatty(fd)) tcsetattr(fd, TCSAFLUSH, &save); + return(err); +} + +int generic_window_size(int fd, void *unused, unsigned short *rows_out, + unsigned short *cols_out) +{ + struct winsize size; + int ret = 0; + + if(ioctl(fd, TIOCGWINSZ, &size) == 0){ + ret = ((*rows_out != size.ws_row) || + (*cols_out != size.ws_col)); + *rows_out = size.ws_row; + *cols_out = size.ws_col; + } + return(ret); +} + +void generic_free(void *data) +{ + kfree(data); +} + +static void winch_handler(int sig) +{ +} + +struct winch_data { + int pty_fd; + int pipe_fd; + int close_me; +}; + +static int winch_thread(void *arg) +{ + struct winch_data *data = arg; + sigset_t sigs; + int pty_fd, pipe_fd; + char c = 1; + + close(data->close_me); + pty_fd = data->pty_fd; + pipe_fd = data->pipe_fd; + if(write(pipe_fd, &c, sizeof(c)) != sizeof(c)) + printk("winch_thread : failed to write synchronization " + "byte, errno = %d\n", errno); + + signal(SIGWINCH, winch_handler); + sigfillset(&sigs); + sigdelset(&sigs, SIGWINCH); + if(sigprocmask(SIG_SETMASK, &sigs, NULL) < 0){ + printk("winch_thread : sigprocmask failed, errno = %d\n", + errno); + exit(1); + } + + if(setsid() < 0){ + printk("winch_thread : setsid failed, errno = %d\n", errno); + exit(1); + } + + if(ioctl(pty_fd, TIOCSCTTY, 0) < 0){ + printk("winch_thread : TIOCSCTTY failed, errno = %d\n", errno); + exit(1); + } + 
diff -Nru a/arch/um/drivers/hostaudio_kern.c b/arch/um/drivers/hostaudio_kern.c
--- a/arch/um/drivers/hostaudio_kern.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/drivers/hostaudio_kern.c	Fri Oct 31 14:10:54 2003
@@ -11,6 +11,7 @@
 #include "linux/fs.h"
 #include "linux/sound.h"
 #include "linux/soundcard.h"
+#include "asm/uaccess.h"
 #include "kern_util.h"
 #include "init.h"
 #include "hostaudio.h"
@@ -22,7 +23,7 @@
 #ifndef MODULE
 static int set_dsp(char *name, int *add)
 {
-	dsp = uml_strdup(name);
+	dsp = name;
 	return(0);
 }
 
@@ -34,7 +35,7 @@
 
 static int set_mixer(char *name, int *add)
 {
-	mixer = uml_strdup(name);
+	mixer = name;
 	return(0);
 }
 
@@ -51,23 +52,55 @@
 			      loff_t *ppos)
 {
 	struct hostaudio_state *state = file->private_data;
+	void *kbuf;
+	int err;
 
 #ifdef DEBUG
 	printk("hostaudio: read called, count = %d\n", count);
 #endif
 
-	return(hostaudio_read_user(state, buffer, count, ppos));
+	kbuf = kmalloc(count, GFP_KERNEL);
+	if(kbuf == NULL)
+		return(-ENOMEM);
+
+	err = hostaudio_read_user(state, kbuf, count, ppos);
+	if(err < 0)
+		goto out;
+
+	if(copy_to_user(buffer, kbuf, err))
+		err = -EFAULT;
+
+ out:
+	kfree(kbuf);
+	return(err);
 }
 
 static ssize_t hostaudio_write(struct file *file, const char *buffer,
 			       size_t count, loff_t *ppos)
 {
 	struct hostaudio_state *state = file->private_data;
+	void *kbuf;
+	int err;
 
 #ifdef DEBUG
 	printk("hostaudio: write called, count = %d\n", count);
 #endif
-	return(hostaudio_write_user(state, buffer, count, ppos));
+
+	kbuf = kmalloc(count, GFP_KERNEL);
+	if(kbuf == NULL)
+		return(-ENOMEM);
+
+	err = -EFAULT;
+	if(copy_from_user(kbuf, buffer, count))
+		goto out;
+
+	err = hostaudio_write_user(state, kbuf, count, ppos);
+	if(err < 0)
+		goto out;
+
+ out:
+	kfree(kbuf);
+	return(err);
 }
 
 static unsigned int hostaudio_poll(struct file *file,
@@ -86,12 +119,43 @@
 			   unsigned int cmd, unsigned long arg)
 {
 	struct hostaudio_state *state = file->private_data;
+	unsigned long data = 0;
+	int err;
 
 #ifdef DEBUG
 	printk("hostaudio: ioctl called, cmd = %u\n", cmd);
 #endif
+	switch(cmd){
+	case SNDCTL_DSP_SPEED:
+	case SNDCTL_DSP_STEREO:
+	case SNDCTL_DSP_GETBLKSIZE:
+	case SNDCTL_DSP_CHANNELS:
+	case SNDCTL_DSP_SUBDIVIDE:
+	case SNDCTL_DSP_SETFRAGMENT:
+		if(get_user(data, (int *) arg))
+			return(-EFAULT);
+		break;
+	default:
+		break;
+	}
+
+	err = hostaudio_ioctl_user(state, cmd, (unsigned long) &data);
+
+	switch(cmd){
+	case SNDCTL_DSP_SPEED:
+	case SNDCTL_DSP_STEREO:
+	case SNDCTL_DSP_GETBLKSIZE:
+	case SNDCTL_DSP_CHANNELS:
+	case SNDCTL_DSP_SUBDIVIDE:
+	case SNDCTL_DSP_SETFRAGMENT:
+		if(put_user(data, (int *) arg))
+			return(-EFAULT);
+		break;
+	default:
+		break;
+	}
 
-	return(hostaudio_ioctl_user(state, cmd, arg));
+	return(err);
 }
 
 static int hostaudio_open(struct inode *inode, struct file *file)
@@ -225,7 +289,8 @@
 
 static int __init hostaudio_init_module(void)
 {
-	printk(KERN_INFO "UML Audio Relay\n");
+	printk(KERN_INFO "UML Audio Relay (host dsp = %s, host mixer = %s)\n",
+	       dsp, mixer);
 
 	module_data.dev_audio = register_sound_dsp(&hostaudio_fops, -1);
 	if(module_data.dev_audio < 0){
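The hostaudio hunks above stop handing userspace pointers straight to the
*_user() helpers: read and write now bounce data through a kernel buffer,
and the ioctl path moves its argument with get_user()/put_user(). A reduced
sketch of the write-side pattern, where do_write_user() is a hypothetical
stand-in for hostaudio_write_user():

	/* Sketch of the bounce-buffer pattern adopted above: copy the
	 * user data into kernel memory before handing it to a helper
	 * that expects a kernel pointer. */
	static ssize_t bounce_write(const char *buffer, size_t count)
	{
		void *kbuf;
		int err;

		kbuf = kmalloc(count, GFP_KERNEL);
		if(kbuf == NULL)
			return(-ENOMEM);

		err = -EFAULT;
		if(copy_from_user(kbuf, buffer, count))
			goto out;

		err = do_write_user(kbuf, count);	/* hypothetical */
	 out:
		kfree(kbuf);
		return(err);
	}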
"linux/module.h" +#include "linux/version.h" +#include "linux/init.h" +#include "linux/slab.h" +#include "linux/fs.h" +#include "linux/sound.h" +#include "linux/soundcard.h" +#include "kern_util.h" +#include "init.h" +#include "hostaudio.h" + +/* Only changed from linux_main at boot time */ +char *dsp = HOSTAUDIO_DEV_DSP; +char *mixer = HOSTAUDIO_DEV_MIXER; + +#ifndef MODULE +static int set_dsp(char *name, int *add) +{ + dsp = uml_strdup(name); + return(0); +} + +__uml_setup("dsp=", set_dsp, +"dsp=\n" +" This is used to specify the host dsp device to the hostaudio driver.\n" +" The default is \"" HOSTAUDIO_DEV_DSP "\".\n\n" +); + +static int set_mixer(char *name, int *add) +{ + mixer = uml_strdup(name); + return(0); +} + +__uml_setup("mixer=", set_mixer, +"mixer=\n" +" This is used to specify the host mixer device to the hostaudio driver.\n" +" The default is \"" HOSTAUDIO_DEV_MIXER "\".\n\n" +); +#endif + +/* /dev/dsp file operations */ + +static ssize_t hostaudio_read(struct file *file, char *buffer, size_t count, + loff_t *ppos) +{ + struct hostaudio_state *state = file->private_data; + +#ifdef DEBUG + printk("hostaudio: read called, count = %d\n", count); +#endif + + return(hostaudio_read_user(state, buffer, count, ppos)); +} + +static ssize_t hostaudio_write(struct file *file, const char *buffer, + size_t count, loff_t *ppos) +{ + struct hostaudio_state *state = file->private_data; + +#ifdef DEBUG + printk("hostaudio: write called, count = %d\n", count); +#endif + return(hostaudio_write_user(state, buffer, count, ppos)); +} + +static unsigned int hostaudio_poll(struct file *file, + struct poll_table_struct *wait) +{ + unsigned int mask = 0; + +#ifdef DEBUG + printk("hostaudio: poll called (unimplemented)\n"); +#endif + + return(mask); +} + +static int hostaudio_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct hostaudio_state *state = file->private_data; + +#ifdef DEBUG + printk("hostaudio: ioctl called, cmd = %u\n", cmd); +#endif + + return(hostaudio_ioctl_user(state, cmd, arg)); +} + +static int hostaudio_open(struct inode *inode, struct file *file) +{ + struct hostaudio_state *state; + int r = 0, w = 0; + int ret; + +#ifdef DEBUG + printk("hostaudio: open called (host: %s)\n", dsp); +#endif + + state = kmalloc(sizeof(struct hostaudio_state), GFP_KERNEL); + if(state == NULL) return(-ENOMEM); + + if(file->f_mode & FMODE_READ) r = 1; + if(file->f_mode & FMODE_WRITE) w = 1; + + ret = hostaudio_open_user(state, r, w, dsp); + if(ret < 0){ + kfree(state); + return(ret); + } + + file->private_data = state; + return(0); +} + +static int hostaudio_release(struct inode *inode, struct file *file) +{ + struct hostaudio_state *state = file->private_data; + int ret; + +#ifdef DEBUG + printk("hostaudio: release called\n"); +#endif + + ret = hostaudio_release_user(state); + kfree(state); + + return(ret); +} + +/* /dev/mixer file operations */ + +static int hostmixer_ioctl_mixdev(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct hostmixer_state *state = file->private_data; + +#ifdef DEBUG + printk("hostmixer: ioctl called\n"); +#endif + + return(hostmixer_ioctl_mixdev_user(state, cmd, arg)); +} + +static int hostmixer_open_mixdev(struct inode *inode, struct file *file) +{ + struct hostmixer_state *state; + int r = 0, w = 0; + int ret; + +#ifdef DEBUG + printk("hostmixer: open called (host: %s)\n", mixer); +#endif + + state = kmalloc(sizeof(struct hostmixer_state), GFP_KERNEL); + if(state == NULL) return(-ENOMEM); 
+ + if(file->f_mode & FMODE_READ) r = 1; + if(file->f_mode & FMODE_WRITE) w = 1; + + ret = hostmixer_open_mixdev_user(state, r, w, mixer); + + if(ret < 0){ + kfree(state); + return(ret); + } + + file->private_data = state; + return(0); +} + +static int hostmixer_release(struct inode *inode, struct file *file) +{ + struct hostmixer_state *state = file->private_data; + int ret; + +#ifdef DEBUG + printk("hostmixer: release called\n"); +#endif + + ret = hostmixer_release_mixdev_user(state); + kfree(state); + + return(ret); +} + + +/* kernel module operations */ + +static struct file_operations hostaudio_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = hostaudio_read, + .write = hostaudio_write, + .poll = hostaudio_poll, + .ioctl = hostaudio_ioctl, + .mmap = NULL, + .open = hostaudio_open, + .release = hostaudio_release, +}; + +static struct file_operations hostmixer_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .ioctl = hostmixer_ioctl_mixdev, + .open = hostmixer_open_mixdev, + .release = hostmixer_release, +}; + +struct { + int dev_audio; + int dev_mixer; +} module_data; + +MODULE_AUTHOR("Steve Schmidtke"); +MODULE_DESCRIPTION("UML Audio Relay"); +MODULE_LICENSE("GPL"); + +static int __init hostaudio_init_module(void) +{ + printk(KERN_INFO "UML Audio Relay\n"); + + module_data.dev_audio = register_sound_dsp(&hostaudio_fops, -1); + if(module_data.dev_audio < 0){ + printk(KERN_ERR "hostaudio: couldn't register DSP device!\n"); + return -ENODEV; + } + + module_data.dev_mixer = register_sound_mixer(&hostmixer_fops, -1); + if(module_data.dev_mixer < 0){ + printk(KERN_ERR "hostmixer: couldn't register mixer " + "device!\n"); + unregister_sound_dsp(module_data.dev_audio); + return -ENODEV; + } + + return 0; +} + +static void __exit hostaudio_cleanup_module (void) +{ + unregister_sound_mixer(module_data.dev_mixer); + unregister_sound_dsp(module_data.dev_audio); +} + +module_init(hostaudio_init_module); +module_exit(hostaudio_cleanup_module); + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. 
diff -Nru a/arch/um/drivers/line.c b/arch/um/drivers/line.c
--- a/arch/um/drivers/line.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/drivers/line.c	Fri Oct 31 14:10:54 2003
@@ -6,8 +6,8 @@
 #include "linux/sched.h"
 #include "linux/slab.h"
 #include "linux/list.h"
+#include "linux/interrupt.h"
 #include "linux/devfs_fs_kernel.h"
-#include "asm/irq.h"
 #include "asm/uaccess.h"
 #include "chan_kern.h"
 #include "irq_user.h"
@@ -19,13 +19,14 @@
 
 #define LINE_BUFSIZE 4096
 
-void line_interrupt(int irq, void *data, struct pt_regs *unused)
+irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused)
 {
 	struct line *dev = data;
 
 	if(dev->count > 0)
 		chan_interrupt(&dev->chan_list, &dev->task, dev->tty, irq,
 			       dev);
+	return IRQ_HANDLED;
 }
 
 void line_timer_cb(void *arg)
@@ -136,20 +137,22 @@
 	return(len);
 }
 
-void line_write_interrupt(int irq, void *data, struct pt_regs *unused)
+irqreturn_t line_write_interrupt(int irq, void *data, struct pt_regs *unused)
 {
 	struct line *dev = data;
 	struct tty_struct *tty = dev->tty;
 	int err;
 
 	err = flush_buffer(dev);
-	if(err == 0) return;
+	if(err == 0)
+		return(IRQ_NONE);
 	else if(err < 0){
 		dev->head = dev->buffer;
 		dev->tail = dev->buffer;
 	}
 
-	if(tty == NULL) return;
+	if(tty == NULL)
+		return(IRQ_NONE);
 
 	if(test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) &&
 	   (tty->ldisc.write_wakeup != NULL))
@@ -161,9 +164,9 @@
 	 * writes.
 	 */
 
-	if (waitqueue_active(&tty->write_wait))
+	if(waitqueue_active(&tty->write_wait))
 		wake_up_interruptible(&tty->write_wait);
-
+	return(IRQ_HANDLED);
 }
 
 int line_write_room(struct tty_struct *tty)
@@ -369,7 +372,7 @@
 
 	dev = simple_strtoul(name, &end, 0);
 	if((*end != '\0') || (end == name)){
-		*error_out = "line_setup failed to parse device number";
+		*error_out = "line_get_config failed to parse device number";
 		return(0);
 	}
 
@@ -379,15 +382,15 @@
 	}
 
 	line = &lines[dev];
+	down(&line->sem);
-
 	if(!line->valid)
 		CONFIG_CHUNK(str, size, n, "none", 1);
 	else if(line->count == 0)
 		CONFIG_CHUNK(str, size, n, line->init_str, 1);
 	else n = chan_config_string(&line->chan_list, str, size, error_out);
-
 	up(&line->sem);
+
 	return(n);
 }
 
@@ -412,7 +415,8 @@
 		return NULL;
 
 	driver->driver_name = line_driver->name;
-	driver->name = line_driver->devfs_name;
+	driver->name = line_driver->device_name;
+	driver->devfs_name = line_driver->devfs_name;
 	driver->major = line_driver->major;
 	driver->minor_start = line_driver->minor_start;
 	driver->type = line_driver->type;
@@ -432,7 +436,7 @@
 
 	for(i = 0; i < nlines; i++){
 		if(!lines[i].valid)
-			tty_unregister_devfs(driver, i);
+			tty_unregister_device(driver, i);
 	}
 
 	mconsole_register_dev(&line_driver->mc);
@@ -465,24 +469,25 @@
 	struct line *line;
 };
 
-void winch_interrupt(int irq, void *data, struct pt_regs *unused)
+irqreturn_t winch_interrupt(int irq, void *data, struct pt_regs *unused)
 {
 	struct winch *winch = data;
 	struct tty_struct *tty;
 	int err;
 	char c;
 
-	err = generic_read(winch->fd, &c, NULL);
-	if(err < 0){
-		if(err != -EAGAIN){
-			printk("winch_interrupt : read failed, errno = %d\n",
-			       -err);
-			printk("fd %d is losing SIGWINCH support\n",
-			       winch->tty_fd);
-			free_irq(irq, data);
-			return;
+	if(winch->fd != -1){
+		err = generic_read(winch->fd, &c, NULL);
+		if(err < 0){
+			if(err != -EAGAIN){
+				printk("winch_interrupt : read failed, "
+				       "errno = %d\n", -err);
+				printk("fd %d is losing SIGWINCH support\n",
+				       winch->tty_fd);
+				return(IRQ_HANDLED);
+			}
+			goto out;
 		}
-		goto out;
 	}
 	tty = winch->line->tty;
 	if(tty != NULL){
@@ -492,7 +497,9 @@
 		kill_pg(tty->pgrp, SIGWINCH, 1);
 	}
  out:
-	reactivate_fd(winch->fd, WINCH_IRQ);
+	if(winch->fd != -1)
+		reactivate_fd(winch->fd, WINCH_IRQ);
+	return(IRQ_HANDLED);
 }
 
 DECLARE_MUTEX(winch_handler_sem);
@@ -529,7 +536,10 @@
 
 	list_for_each(ele, &winch_handlers){
 		winch = list_entry(ele, struct winch, list);
-		close(winch->fd);
+		if(winch->fd != -1){
+			deactivate_fd(winch->fd, WINCH_IRQ);
+			close(winch->fd);
+		}
 		if(winch->pid != -1)
 			os_kill_process(winch->pid, 1);
 	}
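The line.c changes above convert the handlers to the 2.6 interrupt contract:
a handler returns an irqreturn_t so the core can tell whether a shared
interrupt was actually serviced. The general shape of such a conversion, as
a sketch:

	/* Sketch of the irqreturn_t contract followed above: report
	 * IRQ_NONE when the interrupt wasn't ours or there was nothing
	 * to do, IRQ_HANDLED when it was serviced. */
	static irqreturn_t example_interrupt(int irq, void *dev_id,
					     struct pt_regs *regs)
	{
		struct line *dev = dev_id;

		if(dev->count == 0)
			return(IRQ_NONE);

		/* ... service the channel ... */
		return(IRQ_HANDLED);
	}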
diff -Nru a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
--- a/arch/um/drivers/mconsole_kern.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/drivers/mconsole_kern.c	Fri Oct 31 14:10:53 2003
@@ -28,7 +28,7 @@
 #include "os.h"
 #include "umid.h"
 
-static int do_unlink_socket(struct notifier_block *notifier, 
+static int do_unlink_socket(struct notifier_block *notifier,
 			    unsigned long what, void *data)
 {
 	return(mconsole_unlink_socket());
@@ -67,7 +67,7 @@
 
 DECLARE_WORK(mconsole_work, mc_work_proc, NULL);
 
-void mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+irqreturn_t mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
 	int fd;
 	struct mconsole_entry *new;
@@ -88,6 +88,7 @@
 	}
 	if(!list_empty(&mc_requests)) schedule_work(&mconsole_work);
 	reactivate_fd(fd, MCONSOLE_IRQ);
+	return(IRQ_HANDLED);
 }
 
 void mconsole_version(struct mc_request *req)
@@ -100,20 +101,34 @@
 	mconsole_reply(req, version, 0, 0);
 }
 
+void mconsole_log(struct mc_request *req)
+{
+	int len;
+	char *ptr = req->request.data;
+
+	ptr += strlen("log");
+	while(isspace(*ptr)) ptr++;
+
+	len = req->len - (ptr - req->request.data);
+	printk("%.*s", len, ptr);
+	mconsole_reply(req, "", 0, 0);
+}
+
 #define UML_MCONSOLE_HELPTEXT \
-"Commands:
-    version - Get kernel version
-    help - Print this message
-    halt - Halt UML
-    reboot - Reboot UML
-    config <dev>=<config> - Add a new device to UML; 
-	same syntax as command line
-    config <dev> - Query the configuration of a device
-    remove <dev> - Remove a device from UML
-    sysrq <letter> - Performs the SysRq action controlled by the letter
-    cad - invoke the Ctl-Alt-Del handler
-    stop - pause the UML; it will do nothing until it receives a 'go'
-    go - continue the UML after a 'stop'
+"Commands: \n\
+    version - Get kernel version \n\
+    help - Print this message \n\
+    halt - Halt UML \n\
+    reboot - Reboot UML \n\
+    config <dev>=<config> - Add a new device to UML; \n\
+	same syntax as command line \n\
+    config <dev> - Query the configuration of a device \n\
+    remove <dev> - Remove a device from UML \n\
+    sysrq <letter> - Performs the SysRq action controlled by the letter \n\
+    cad - invoke the Ctl-Alt-Del handler \n\
+    stop - pause the UML; it will do nothing until it receives a 'go' \n\
+    go - continue the UML after a 'stop' \n\
+    log <string> - make UML enter <string> into the kernel log\n\
 "
 
 void mconsole_help(struct mc_request *req)
@@ -302,7 +317,7 @@
 	if(umid_file_name("mconsole", file, sizeof(file))) return(-1);
 	snprintf(mconsole_socket_name, sizeof(file), "%s", file);
 
-	sock = create_unix_socket(file, sizeof(file));
+	sock = create_unix_socket(file, sizeof(file), 1);
 	if (sock < 0){
 		printk("Failed to initialize management console\n");
 		return(1);
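The new mconsole "log" command above prints the request payload with the
"%.*s" idiom, which takes the maximum field width from the argument list, so
the print stays bounded even when the payload is not NUL-terminated. A
standalone illustration:

	/* Standalone illustration of the bounded "%.*s" print used by
	 * mconsole_log() above. */
	#include <stdio.h>

	int main(void)
	{
		char data[] = { 'h', 'e', 'l', 'l', 'o' };	/* no NUL */

		printf("%.*s\n", (int) sizeof(data), data);	/* "hello" */
		return(0);
	}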
"asm/uaccess.h" +#include "user_util.h" +#include "kern_util.h" +#include "kern.h" +#include "mconsole.h" +#include "mconsole_kern.h" +#include "irq_user.h" +#include "init.h" +#include "os.h" +#include "umid.h" +#include "irq_kern.h" + +static int do_unlink_socket(struct notifier_block *notifier, + unsigned long what, void *data) +{ + return(mconsole_unlink_socket()); +} + + +static struct notifier_block reboot_notifier = { + .notifier_call = do_unlink_socket, + .priority = 0, +}; + +/* Safe without explicit locking for now. Tasklets provide their own + * locking, and the interrupt handler is safe because it can't interrupt + * itself and it can only happen on CPU 0. + */ + +LIST_HEAD(mc_requests); + +void mc_work_proc(void *unused) +{ + struct mconsole_entry *req; + unsigned long flags; + int done; + + do { + local_save_flags(flags); + req = list_entry(mc_requests.next, struct mconsole_entry, + list); + list_del(&req->list); + done = list_empty(&mc_requests); + local_irq_restore(flags); + req->request.cmd->handler(&req->request); + kfree(req); + } while(!done); +} + +DECLARE_WORK(mconsole_work, mc_work_proc, NULL); + +irqreturn_t mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + int fd; + struct mconsole_entry *new; + struct mc_request req; + + fd = (int) dev_id; + while (mconsole_get_request(fd, &req)){ + if(req.cmd->as_interrupt) (*req.cmd->handler)(&req); + else { + new = kmalloc(sizeof(req), GFP_ATOMIC); + if(new == NULL) + mconsole_reply(&req, "Out of memory", 1, 0); + else { + new->request = req; + list_add(&new->list, &mc_requests); + } + } + } + if(!list_empty(&mc_requests)) schedule_work(&mconsole_work); + reactivate_fd(fd, MCONSOLE_IRQ); + return(IRQ_HANDLED); +} + +void mconsole_version(struct mc_request *req) +{ + char version[256]; + + sprintf(version, "%s %s %s %s %s", system_utsname.sysname, + system_utsname.nodename, system_utsname.release, + system_utsname.version, system_utsname.machine); + mconsole_reply(req, version, 0, 0); +} + +void mconsole_log(struct mc_request *req) +{ + int len; + char *ptr = req->request.data; + + ptr += strlen("log"); + while(isspace(*ptr)) ptr++; + + len = ptr - req->request.data; + printk("%.*s", len, ptr); + mconsole_reply(req, "", 0, 0); +} + +#define UML_MCONSOLE_HELPTEXT \ +"Commands: \n\ + version - Get kernel version \n\ + help - Print this message \n\ + halt - Halt UML \n\ + reboot - Reboot UML \n\ + config = - Add a new device to UML; \n\ + same syntax as command line \n\ + config - Query the configuration of a device \n\ + remove - Remove a device from UML \n\ + sysrq - Performs the SysRq action controlled by the letter \n\ + cad - invoke the Ctl-Alt-Del handler \n\ + stop - pause the UML; it will do nothing until it receives a 'go' \n\ + go - continue the UML after a 'stop' \n\ + log - make UML enter into the kernel log\n\ +" + +void mconsole_help(struct mc_request *req) +{ + mconsole_reply(req, UML_MCONSOLE_HELPTEXT, 0, 0); +} + +void mconsole_halt(struct mc_request *req) +{ + mconsole_reply(req, "", 0, 0); + machine_halt(); +} + +void mconsole_reboot(struct mc_request *req) +{ + mconsole_reply(req, "", 0, 0); + machine_restart(NULL); +} + +extern void ctrl_alt_del(void); + +void mconsole_cad(struct mc_request *req) +{ + mconsole_reply(req, "", 0, 0); + ctrl_alt_del(); +} + +void mconsole_go(struct mc_request *req) +{ + mconsole_reply(req, "Not stopped", 1, 0); +} + +void mconsole_stop(struct mc_request *req) +{ + deactivate_fd(req->originating_fd, MCONSOLE_IRQ); + os_set_fd_block(req->originating_fd, 1); + 
mconsole_reply(req, "", 0, 0); + while(mconsole_get_request(req->originating_fd, req)){ + if(req->cmd->handler == mconsole_go) break; + (*req->cmd->handler)(req); + } + os_set_fd_block(req->originating_fd, 0); + reactivate_fd(req->originating_fd, MCONSOLE_IRQ); + mconsole_reply(req, "", 0, 0); +} + +/* This list is populated by __initcall routines. */ + +LIST_HEAD(mconsole_devices); + +void mconsole_register_dev(struct mc_device *new) +{ + list_add(&new->list, &mconsole_devices); +} + +static struct mc_device *mconsole_find_dev(char *name) +{ + struct list_head *ele; + struct mc_device *dev; + + list_for_each(ele, &mconsole_devices){ + dev = list_entry(ele, struct mc_device, list); + if(!strncmp(name, dev->name, strlen(dev->name))) + return(dev); + } + return(NULL); +} + +#define CONFIG_BUF_SIZE 64 + +static void mconsole_get_config(int (*get_config)(char *, char *, int, + char **), + struct mc_request *req, char *name) +{ + char default_buf[CONFIG_BUF_SIZE], *error, *buf; + int n, size; + + if(get_config == NULL){ + mconsole_reply(req, "No get_config routine defined", 1, 0); + return; + } + + error = NULL; + size = sizeof(default_buf)/sizeof(default_buf[0]); + buf = default_buf; + + while(1){ + n = (*get_config)(name, buf, size, &error); + if(error != NULL){ + mconsole_reply(req, error, 1, 0); + goto out; + } + + if(n <= size){ + mconsole_reply(req, buf, 0, 0); + goto out; + } + + if(buf != default_buf) + kfree(buf); + + size = n; + buf = kmalloc(size, GFP_KERNEL); + if(buf == NULL){ + mconsole_reply(req, "Failed to allocate buffer", 1, 0); + return; + } + } + out: + if(buf != default_buf) + kfree(buf); + +} + +void mconsole_config(struct mc_request *req) +{ + struct mc_device *dev; + char *ptr = req->request.data, *name; + int err; + + ptr += strlen("config"); + while(isspace(*ptr)) ptr++; + dev = mconsole_find_dev(ptr); + if(dev == NULL){ + mconsole_reply(req, "Bad configuration option", 1, 0); + return; + } + + name = &ptr[strlen(dev->name)]; + ptr = name; + while((*ptr != '=') && (*ptr != '\0')) + ptr++; + + if(*ptr == '='){ + err = (*dev->config)(name); + mconsole_reply(req, "", err, 0); + } + else mconsole_get_config(dev->get_config, req, name); +} + +void mconsole_remove(struct mc_request *req) +{ + struct mc_device *dev; + char *ptr = req->request.data; + int err; + + ptr += strlen("remove"); + while(isspace(*ptr)) ptr++; + dev = mconsole_find_dev(ptr); + if(dev == NULL){ + mconsole_reply(req, "Bad remove option", 1, 0); + return; + } + err = (*dev->remove)(&ptr[strlen(dev->name)]); + mconsole_reply(req, "", err, 0); +} + +#ifdef CONFIG_MAGIC_SYSRQ +void mconsole_sysrq(struct mc_request *req) +{ + char *ptr = req->request.data; + + ptr += strlen("sysrq"); + while(isspace(*ptr)) ptr++; + + handle_sysrq(*ptr, ¤t->thread.regs, NULL); + mconsole_reply(req, "", 0, 0); +} +#else +void mconsole_sysrq(struct mc_request *req) +{ + mconsole_reply(req, "Sysrq not compiled in", 1, 0); +} +#endif + +/* Changed by mconsole_setup, which is __setup, and called before SMP is + * active. 
+ */ +static char *notify_socket = NULL; + +int mconsole_init(void) +{ + int err, sock; + char file[256]; + + if(umid_file_name("mconsole", file, sizeof(file))) return(-1); + snprintf(mconsole_socket_name, sizeof(file), "%s", file); + + sock = create_unix_socket(file, sizeof(file), 1); + if (sock < 0){ + printk("Failed to initialize management console\n"); + return(1); + } + + register_reboot_notifier(&reboot_notifier); + + err = um_request_irq(MCONSOLE_IRQ, sock, IRQ_READ, mconsole_interrupt, + SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM, + "mconsole", (void *)sock); + if (err){ + printk("Failed to get IRQ for management console\n"); + return(1); + } + + if(notify_socket != NULL){ + notify_socket = uml_strdup(notify_socket); + if(notify_socket != NULL) + mconsole_notify(notify_socket, MCONSOLE_SOCKET, + mconsole_socket_name, + strlen(mconsole_socket_name) + 1); + else printk(KERN_ERR "mconsole_setup failed to strdup " + "string\n"); + } + + printk("mconsole (version %d) initialized on %s\n", + MCONSOLE_VERSION, mconsole_socket_name); + return(0); +} + +__initcall(mconsole_init); + +static int write_proc_mconsole(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + char *buf; + + buf = kmalloc(count + 1, GFP_KERNEL); + if(buf == NULL) + return(-ENOMEM); + + if(copy_from_user(buf, buffer, count)) + return(-EFAULT); + buf[count] = '\0'; + + mconsole_notify(notify_socket, MCONSOLE_USER_NOTIFY, buf, count); + return(count); +} + +static int create_proc_mconsole(void) +{ + struct proc_dir_entry *ent; + + if(notify_socket == NULL) return(0); + + ent = create_proc_entry("mconsole", S_IFREG | 0200, NULL); + if(ent == NULL){ + printk("create_proc_mconsole : create_proc_entry failed\n"); + return(0); + } + + ent->read_proc = NULL; + ent->write_proc = write_proc_mconsole; + return(0); +} + +static spinlock_t notify_spinlock = SPIN_LOCK_UNLOCKED; + +void lock_notify(void) +{ + spin_lock(¬ify_spinlock); +} + +void unlock_notify(void) +{ + spin_unlock(¬ify_spinlock); +} + +__initcall(create_proc_mconsole); + +#define NOTIFY "=notify:" + +static int mconsole_setup(char *str) +{ + if(!strncmp(str, NOTIFY, strlen(NOTIFY))){ + str += strlen(NOTIFY); + notify_socket = str; + } + else printk(KERN_ERR "mconsole_setup : Unknown option - '%s'\n", str); + return(1); +} + +__setup("mconsole", mconsole_setup); + +__uml_help(mconsole_setup, +"mconsole=notify:\n" +" Requests that the mconsole driver send a message to the named Unix\n" +" socket containing the name of the mconsole socket. This also serves\n" +" to notify outside processes when UML has booted far enough to respond\n" +" to mconsole requests.\n\n" +); + +static int notify_panic(struct notifier_block *self, unsigned long unused1, + void *ptr) +{ + char *message = ptr; + + if(notify_socket == NULL) return(0); + + mconsole_notify(notify_socket, MCONSOLE_PANIC, message, + strlen(message) + 1); + return(0); +} + +static struct notifier_block panic_exit_notifier = { + .notifier_call = notify_panic, + .next = NULL, + .priority = 1 +}; + +static int add_notifier(void) +{ + notifier_chain_register(&panic_notifier_list, &panic_exit_notifier); + return(0); +} + +__initcall(add_notifier); + +char *mconsole_notify_socket(void) +{ + return(notify_socket); +} + +EXPORT_SYMBOL(mconsole_notify_socket); + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. 
+ * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/drivers/mconsole_kern.c~uml-summa.diff b/arch/um/drivers/mconsole_kern.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/drivers/mconsole_kern.c~uml-summa.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,450 @@ +/* + * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) + * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include "linux/kernel.h" +#include "linux/slab.h" +#include "linux/init.h" +#include "linux/notifier.h" +#include "linux/reboot.h" +#include "linux/utsname.h" +#include "linux/ctype.h" +#include "linux/interrupt.h" +#include "linux/sysrq.h" +#include "linux/workqueue.h" +#include "linux/module.h" +#include "linux/proc_fs.h" +#include "asm/irq.h" +#include "asm/uaccess.h" +#include "user_util.h" +#include "kern_util.h" +#include "kern.h" +#include "mconsole.h" +#include "mconsole_kern.h" +#include "irq_user.h" +#include "init.h" +#include "os.h" +#include "umid.h" + +static int do_unlink_socket(struct notifier_block *notifier, + unsigned long what, void *data) +{ + return(mconsole_unlink_socket()); +} + + +static struct notifier_block reboot_notifier = { + .notifier_call = do_unlink_socket, + .priority = 0, +}; + +/* Safe without explicit locking for now. Tasklets provide their own + * locking, and the interrupt handler is safe because it can't interrupt + * itself and it can only happen on CPU 0. + */ + +LIST_HEAD(mc_requests); + +void mc_work_proc(void *unused) +{ + struct mconsole_entry *req; + unsigned long flags; + int done; + + do { + local_save_flags(flags); + req = list_entry(mc_requests.next, struct mconsole_entry, + list); + list_del(&req->list); + done = list_empty(&mc_requests); + local_irq_restore(flags); + req->request.cmd->handler(&req->request); + kfree(req); + } while(!done); +} + +DECLARE_WORK(mconsole_work, mc_work_proc, NULL); + +void mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + int fd; + struct mconsole_entry *new; + struct mc_request req; + + fd = (int) dev_id; + while (mconsole_get_request(fd, &req)){ + if(req.cmd->as_interrupt) (*req.cmd->handler)(&req); + else { + new = kmalloc(sizeof(req), GFP_ATOMIC); + if(new == NULL) + mconsole_reply(&req, "Out of memory", 1, 0); + else { + new->request = req; + list_add(&new->list, &mc_requests); + } + } + } + if(!list_empty(&mc_requests)) schedule_work(&mconsole_work); + reactivate_fd(fd, MCONSOLE_IRQ); +} + +void mconsole_version(struct mc_request *req) +{ + char version[256]; + + sprintf(version, "%s %s %s %s %s", system_utsname.sysname, + system_utsname.nodename, system_utsname.release, + system_utsname.version, system_utsname.machine); + mconsole_reply(req, version, 0, 0); +} + +#define UML_MCONSOLE_HELPTEXT \ +"Commands: + version - Get kernel version + help - Print this message + halt - Halt UML + reboot - Reboot UML + config = - Add a new device to UML; + same syntax as command line + config - Query the configuration of a device + remove - Remove a device from UML + sysrq - Performs the SysRq action controlled by the letter + cad - invoke the Ctl-Alt-Del handler + stop - pause the UML; it will do nothing until it receives a 'go' + go - continue the UML after a 'stop' +" + +void mconsole_help(struct mc_request *req) +{ + mconsole_reply(req, UML_MCONSOLE_HELPTEXT, 0, 0); +} + +void mconsole_halt(struct mc_request *req) +{ + mconsole_reply(req, "", 0, 0); + 
diff -Nru a/arch/um/drivers/mconsole_user.c b/arch/um/drivers/mconsole_user.c
--- a/arch/um/drivers/mconsole_user.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/drivers/mconsole_user.c	Fri Oct 31 14:10:53 2003
@@ -28,6 +28,7 @@
 	{ "cad", mconsole_cad, 1 },
 	{ "stop", mconsole_stop, 0 },
 	{ "go", mconsole_go, 1 },
+	{ "log", mconsole_log, 1 },
 };
 
 /* Initialized in mconsole_init, which is an initcall */
@@ -139,6 +140,7 @@
 		memcpy(reply.data, str, len);
 		reply.data[len] = '\0';
 		total -= len;
+		str += len;
 		reply.len = len + 1;
 
 		len = sizeof(reply) + reply.len - sizeof(reply.data);
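The str += len added above matters as soon as a reply no longer fits in one
packet: mconsole_reply() sends the string in chunks of at most
MCONSOLE_MAX_DATA - 1 bytes, and without advancing str every packet would
carry the first chunk again.  A standalone sketch of the corrected loop
(illustrative, not part of the patch; MAX_DATA is a tiny stand-in so the
chunking is visible):

/* Illustrative only: the chunking loop of mconsole_reply() with the fix. */
#include <stdio.h>
#include <string.h>

#define MAX_DATA 8
#define MIN(a,b) ((a)<(b) ? (a):(b))

static void send_chunks(const char *str)
{
	char data[MAX_DATA];
	int total = strlen(str), len;

	do {
		len = MIN(total, MAX_DATA - 1);
		memcpy(data, str, len);
		data[len] = '\0';
		total -= len;
		str += len;	/* the fix: consume what was just sent */
		printf("packet: '%s' (more = %d)\n", data, total > 0);
	} while(total > 0);
}

int main(void)
{
	send_chunks("a reply spanning several packets");
	return 0;
}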
diff -Nru a/arch/um/drivers/mmapper_kern.c b/arch/um/drivers/mmapper_kern.c
--- a/arch/um/drivers/mmapper_kern.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/drivers/mmapper_kern.c	Fri Oct 31 14:10:53 2003
@@ -120,7 +120,10 @@
 	printk(KERN_INFO "Mapper v0.1\n");
 
 	v_buf = (char *) find_iomem("mmapper", &mmapper_size);
-	if(mmapper_size == 0) return(0);
+	if(mmapper_size == 0){
+		printk(KERN_ERR "mmapper_init - find_iomem failed\n");
+		return(0);
+	}
 
 	p_buf = __pa(v_buf);
diff -Nru a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
--- a/arch/um/drivers/net_kern.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/drivers/net_kern.c	Fri Oct 31 14:10:53 2003
@@ -61,14 +61,14 @@
 	return pkt_len;
 }
 
-void uml_net_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+irqreturn_t uml_net_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
 	struct net_device *dev = dev_id;
 	struct uml_net_private *lp = dev->priv;
 	int err;
 
 	if(!netif_running(dev))
-		return;
+		return(IRQ_NONE);
 
 	spin_lock(&lp->lock);
 	while((err = uml_net_rx(dev)) > 0) ;
@@ -83,6 +83,7 @@
 
  out:
 	spin_unlock(&lp->lock);
+	return(IRQ_HANDLED);
 }
 
 static int uml_net_open(struct net_device *dev)
@@ -252,37 +253,6 @@
 #endif
 }
 
-/*
- * default do nothing hard header packet routines for struct net_device init.
- * real ethernet transports will overwrite with real routines.
- */
-static int uml_net_hard_header(struct sk_buff *skb, struct net_device *dev,
-		unsigned short type, void *daddr, void *saddr, unsigned len)
-{
-	return(0); /* no change */
-}
-
-static int uml_net_rebuild_header(struct sk_buff *skb)
-{
-	return(0); /* ignore */
-}
-
-static int uml_net_header_cache(struct neighbour *neigh, struct hh_cache *hh)
-{
-	return(-1); /* fail */
-}
-
-static void uml_net_header_cache_update(struct hh_cache *hh,
-		struct net_device *dev, unsigned char * haddr)
-{
-	/* ignore */
-}
-
-static int uml_net_header_parse(struct sk_buff *skb, unsigned char *haddr)
-{
-	return(0); /* nothing */
-}
-
 static spinlock_t devices_lock = SPIN_LOCK_UNLOCKED;
 static struct list_head devices = LIST_HEAD_INIT(devices);
 
@@ -292,7 +262,7 @@
 	struct uml_net *device;
 	struct net_device *dev;
 	struct uml_net_private *lp;
-	int err, size;
+	int save, err, size;
 
 	size = transport->private_size + sizeof(struct uml_net_private) +
 		sizeof(((struct uml_net_private *) 0)->user);
@@ -334,12 +304,6 @@
 	snprintf(dev->name, sizeof(dev->name), "eth%d", n);
 	device->dev = dev;
 
-	dev->hard_header = uml_net_hard_header;
-	dev->rebuild_header = uml_net_rebuild_header;
-	dev->hard_header_cache = uml_net_header_cache;
-	dev->header_cache_update= uml_net_header_cache_update;
-	dev->hard_header_parse = uml_net_header_parse;
-
 	(*transport->kern->init)(dev, init);
 
 	dev->mtu = transport->user->max_packet;
@@ -362,21 +326,29 @@
 		return 1;
 	lp = dev->priv;
 
-	INIT_LIST_HEAD(&lp->list);
-	spin_lock_init(&lp->lock);
-	lp->dev = dev;
-	lp->fd = -1;
-	lp->mac = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0 };
-	lp->have_mac = device->have_mac;
-	lp->protocol = transport->kern->protocol;
-	lp->open = transport->user->open;
-	lp->close = transport->user->close;
-	lp->remove = transport->user->remove;
-	lp->read = transport->kern->read;
-	lp->write = transport->kern->write;
-	lp->add_address = transport->user->add_address;
-	lp->delete_address = transport->user->delete_address;
-	lp->set_mtu = transport->user->set_mtu;
+	/* lp.user is the first four bytes of the transport data, which
+	 * has already been initialized.  This structure assignment will
+	 * overwrite that, so we make sure that .user gets overwritten with
+	 * what it already has.
+	 */
+	save = lp->user[0];
+	*lp = ((struct uml_net_private)
+		{ .list			= LIST_HEAD_INIT(lp->list),
+		  .lock			= SPIN_LOCK_UNLOCKED,
+		  .dev			= dev,
+		  .fd			= -1,
+		  .mac			= { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0},
+		  .have_mac		= device->have_mac,
+		  .protocol		= transport->kern->protocol,
+		  .open			= transport->user->open,
+		  .close		= transport->user->close,
+		  .remove		= transport->user->remove,
+		  .read			= transport->kern->read,
+		  .write		= transport->kern->write,
+		  .add_address		= transport->user->add_address,
+		  .delete_address	= transport->user->delete_address,
+		  .set_mtu		= transport->user->set_mtu,
+		  .user			= { save } });
 
 	init_timer(&lp->tl);
 	lp->tl.function = uml_net_user_timer_expire;
@@ -609,7 +581,8 @@
 
 	unregister_netdev(dev);
 	list_del(&device->list);
-	free_netdev(device);
+	kfree(device);
+	free_netdev(dev);
 	return(0);
 }
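Two things are going on in the last hunks of this diff.  The net_remove()
change fixes a mismatched free: device (a struct uml_net) was being passed to
free_netdev(), which expects the struct net_device, so the wrapper now gets
kfree() and the netdev gets free_netdev().  The eth_configure() change is the
save/assign/restore idiom: assigning a compound literal zeroes every member
it does not name, including the first word of the transport-private user[]
area that (*transport->kern->init)() has already filled in, so that word is
saved and put back.  A self-contained sketch of the idiom (illustrative, not
part of the patch; struct priv is a made-up stand-in for
struct uml_net_private):

/* Illustrative only: why eth_configure() saves lp->user[0] around the
 * structure assignment.  Members absent from a compound literal are
 * zeroed, so previously initialized trailing data must be carried over. */
#include <assert.h>

struct priv {
	int fd;
	int user[1];	/* head of the transport-private area */
};

static void reinit(struct priv *lp)
{
	int save = lp->user[0];		/* already initialized */

	*lp = ((struct priv) { .fd	= -1,
			       .user	= { save } });
}

int main(void)
{
	struct priv p = { .fd = 3, .user = { 42 } };

	reinit(&p);
	assert(p.fd == -1 && p.user[0] == 42);
	return 0;
}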
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff -Nru a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c
--- a/arch/um/drivers/port_kern.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/drivers/port_kern.c	Fri Oct 31 14:10:53 2003
@@ -6,6 +6,7 @@
 #include "linux/list.h"
 #include "linux/sched.h"
 #include "linux/slab.h"
+#include "linux/interrupt.h"
 #include "linux/irq.h"
 #include "linux/spinlock.h"
 #include "linux/errno.h"
@@ -44,7 +45,7 @@
 	struct port_list *port;
 };
 
-static void pipe_interrupt(int irq, void *data, struct pt_regs *regs)
+static irqreturn_t pipe_interrupt(int irq, void *data, struct pt_regs *regs)
 {
 	struct connection *conn = data;
 	int fd;
@@ -52,7 +53,7 @@
 	fd = os_rcv_fd(conn->socket[0], &conn->helper_pid);
 	if(fd < 0){
 		if(fd == -EAGAIN)
-			return;
+			return(IRQ_NONE);
 
 		printk(KERN_ERR "pipe_interrupt : os_rcv_fd returned %d\n",
 		       -fd);
@@ -65,6 +66,7 @@
 	list_add(&conn->list, &conn->port->connections);
 
 	up(&conn->port->sem);
+	return(IRQ_HANDLED);
 }
 
 static int port_accept(struct port_list *port)
@@ -138,12 +140,13 @@
 
 DECLARE_WORK(port_work, port_work_proc, NULL);
 
-static void port_interrupt(int irq, void *data, struct pt_regs *regs)
+static irqreturn_t port_interrupt(int irq, void *data, struct pt_regs *regs)
 {
 	struct port_list *port = data;
 
 	port->has_connection = 1;
 	schedule_work(&port_work);
+	return(IRQ_HANDLED);
 }
 
 void *port_data(int port_num)
diff -Nru a/arch/um/drivers/ssl.c b/arch/um/drivers/ssl.c
--- a/arch/um/drivers/ssl.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/drivers/ssl.c	Fri Oct 31 14:10:53 2003
@@ -53,8 +53,9 @@
 
 static struct line_driver driver = {
 	.name 			= "UML serial line",
-	.devfs_name 		= "tts/%d",
-	.major 			= TTYAUX_MAJOR,
+	.device_name 		= "ttS",
+	.devfs_name 		= "tts/",
+	.major 			= TTY_MAJOR,
 	.minor_start 		= 64,
 	.type 			= TTY_DRIVER_TYPE_SERIAL,
 	.subtype 		= 0,
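The port_kern.c hunks above are part of the 2.6 interrupt API conversion: handlers now return irqreturn_t instead of void, so the core can tell whether a shared interrupt was actually serviced. A minimal sketch of the convention follows; the device structure and handler names are hypothetical, not part of this patch:

/* Sketch of the 2.6 irqreturn_t convention (hypothetical example_dev). */
#include <linux/interrupt.h>

struct example_dev {
	int pending;
};

static irqreturn_t example_interrupt(int irq, void *data, struct pt_regs *regs)
{
	struct example_dev *dev = data;

	if (!dev->pending)
		return IRQ_NONE;	/* not ours - matters for SA_SHIRQ lines */

	dev->pending = 0;
	/* ... service the device ... */
	return IRQ_HANDLED;		/* we consumed the interrupt */
}

This is why pipe_interrupt() returns IRQ_NONE on -EAGAIN (nothing was actually delivered) but IRQ_HANDLED after queuing a connection.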
diff -Nru a/arch/um/drivers/stdio_console.c b/arch/um/drivers/stdio_console.c
--- a/arch/um/drivers/stdio_console.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/drivers/stdio_console.c	Fri Oct 31 14:10:53 2003
@@ -83,7 +83,8 @@
 
 static struct line_driver driver = {
 	.name 			= "UML console",
-	.devfs_name 		= "vc/%d",
+	.device_name 		= "tty",
+	.devfs_name 		= "vc/",
 	.major 			= TTY_MAJOR,
 	.minor_start 		= 0,
 	.type 			= TTY_DRIVER_TYPE_CONSOLE,
@@ -159,14 +160,28 @@
 
 static int con_init_done = 0;
 
+static struct tty_operations console_ops = {
+	.open 			= con_open,
+	.close 			= con_close,
+	.write 			= con_write,
+	.chars_in_buffer 	= chars_in_buffer,
+	.set_termios 		= set_termios,
+	.write_room 		= line_write_room,
+};
+
+extern int tty_init(void);
+
 int stdio_init(void)
 {
 	char *new_title;
 
 	printk(KERN_INFO "Initializing stdio console driver\n");
 
+	tty_init();
+
 	console_driver = line_register_devfs(&console_lines, &driver,
-			      &console_ops, vts, sizeof(vts)/sizeof(vts[0]));
+					     &console_ops, vts,
+					     sizeof(vts)/sizeof(vts[0]));
 
 	lines_init(vts, sizeof(vts)/sizeof(vts[0]));
 
@@ -188,15 +203,6 @@
 	if(con_init_done) up(&vts[console->index].sem);
 }
 
-static struct tty_operations console_ops = {
-	.open 			= con_open,
-	.close 			= con_close,
-	.write 			= con_write,
-	.chars_in_buffer 	= chars_in_buffer,
-	.set_termios 		= set_termios,
-	.write_room 		= line_write_room,
-};
-
 static struct tty_driver *console_device(struct console *c, int *index)
 {
 	*index = c->index;
@@ -212,12 +218,14 @@
 			console_device, console_setup,
 			CON_PRINTBUFFER);
 
-static void __init stdio_console_init(void)
+static int __init stdio_console_init(void)
 {
 	INIT_LIST_HEAD(&vts[0].chan_list);
 	list_add(&init_console_chan.list, &vts[0].chan_list);
 	register_console(&stdiocons);
+	return(0);
 }
+
 console_initcall(stdio_console_init);
 
 static int console_chan_setup(char *str)
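Two things happen in the stdio_console.c diff: console_ops is moved above stdio_init(), which now references it, and stdio_console_init() is changed from void to int because console_initcall(), like the other initcall classes, expects an initcall_t, i.e. int (*)(void). A compressed sketch of that contract, using a hypothetical console rather than this driver's:

/* Sketch of the console_initcall contract (hypothetical "excon" console).
 * The initcall must be int (*)(void) and return 0 on success. */
#include <linux/console.h>
#include <linux/init.h>

static void excon_write(struct console *c, const char *s, unsigned n)
{
	/* ... emit n bytes to the underlying channel ... */
}

static struct console excon = {
	.name	= "excon",
	.write	= excon_write,
	.flags	= CON_PRINTBUFFER,	/* replay the log buffer on registration */
};

static int __init excon_init(void)
{
	register_console(&excon);
	return 0;
}
console_initcall(excon_init);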
diff -Nru a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
--- a/arch/um/drivers/ubd_kern.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/drivers/ubd_kern.c	Fri Oct 31 14:10:53 2003
@@ -8,15 +8,23 @@
  * old style ubd by setting UBD_SHIFT to 0
  * 2002-09-27...2002-10-18 massive tinkering for 2.5
  * partitions have changed in 2.5
+ * 2003-01-29 more tinkering for 2.5.59-1
+ * This should now address the sysfs problems and has
+ * the symlink for devfs to allow for booting with
+ * the common /dev/ubd/discX/... names rather than
+ * only /dev/ubdN/discN.  This version also has lots of
+ * cleanups preparing for ubd-many.
+ * James McMechan
  */
 
 #define MAJOR_NR UBD_MAJOR
-#define UBD_SHIFT 4
+#define UBD_SHIFT 0
 
 #include "linux/config.h"
 #include "linux/module.h"
 #include "linux/blkdev.h"
 #include "linux/hdreg.h"
+#include "linux/interrupt.h"
 #include "linux/init.h"
 #include "linux/devfs_fs_kernel.h"
 #include "linux/cdrom.h"
@@ -28,6 +36,7 @@
 #include "linux/blkpg.h"
 #include "linux/genhd.h"
 #include "linux/spinlock.h"
+#include "linux/bitops.h"
 #include "asm/segment.h"
 #include "asm/uaccess.h"
 #include "asm/irq.h"
@@ -47,7 +56,10 @@
 static spinlock_t ubd_io_lock = SPIN_LOCK_UNLOCKED;
 static spinlock_t ubd_lock = SPIN_LOCK_UNLOCKED;
 
-static void (*do_ubd)(void);
+/* We set this when we ask the io thread to do some work; using this flag
+   we avoid having do_ubd_request schedule io more than once for any given
+   request. (race seen on SMP) */
+static long ubd_servicing;
 
 static int ubd_open(struct inode * inode, struct file * filp);
 static int ubd_release(struct inode * inode, struct file * file);
@@ -67,7 +79,7 @@
 static request_queue_t *ubd_queue;
 
 /* Protected by ubd_lock */
-static int fake_major = 0;
+static int fake_major = MAJOR_NR;
 
 static struct gendisk *ubd_gendisk[MAX_DEV];
 static struct gendisk *fake_gendisk[MAX_DEV];
@@ -96,12 +108,12 @@
 
 struct ubd {
 	char *file;
-	int is_dir;
 	int count;
 	int fd;
 	__u64 size;
 	struct openflags boot_openflags;
 	struct openflags openflags;
+	int no_cow;
 	struct cow cow;
 };
 
@@ -115,12 +127,12 @@
 
 #define DEFAULT_UBD { \
 	.file = 		NULL, \
-	.is_dir =		0, \
 	.count =		0, \
 	.fd =			-1, \
 	.size =			-1, \
 	.boot_openflags =	OPEN_FLAGS, \
 	.openflags =		OPEN_FLAGS, \
+	.no_cow =		0, \
 	.cow =			DEFAULT_COW, \
 }
 
@@ -128,8 +140,10 @@
 
 static int ubd0_init(void)
 {
-	if(ubd_dev[0].file == NULL)
-		ubd_dev[0].file = "root_fs";
+	struct ubd *dev = &ubd_dev[0];
+
+	if(dev->file == NULL)
+		dev->file = "root_fs";
 	return(0);
 }
 
@@ -196,19 +210,39 @@
 " Create ide0 entries that map onto ubd devices.\n\n"
 );
 
+static int parse_unit(char **ptr)
+{
+	char *str = *ptr, *end;
+	int n = -1;
+
+	if(isdigit(*str)) {
+		n = simple_strtoul(str, &end, 0);
+		if(end == str)
+			return(-1);
+		*ptr = end;
+	}
+	else if (('a' <= *str) && (*str <= 'h')) {
+		n = *str - 'a';
+		str++;
+		*ptr = str;
+	}
+	return(n);
+}
+
 static int ubd_setup_common(char *str, int *index_out)
 {
+	struct ubd *dev;
 	struct openflags flags = global_openflags;
 	char *backing_file;
 	int n, err;
 
 	if(index_out) *index_out = -1;
-	n = *str++;
+	n = *str;
 	if(n == '='){
-		static int fake_major_allowed = 1;
 		char *end;
 		int major;
 
+		str++;
 		if(!strcmp(str, "sync")){
 			global_openflags.s = 1;
 			return(0);
@@ -220,20 +254,14 @@
 			return(1);
 		}
 
-		if(!fake_major_allowed){
-			printk(KERN_ERR "Can't assign a fake major twice\n");
-			return(1);
-		}
-
 		err = 1;
 		spin_lock(&ubd_lock);
-		if(!fake_major_allowed){
+		if(fake_major != MAJOR_NR){
printk(KERN_ERR "Can't assign a fake major twice\n"); goto out1; } fake_major = major; - fake_major_allowed = 0; printk(KERN_INFO "Setting extra ubd major number to %d\n", major); @@ -243,25 +271,23 @@ return(err); } - if(n < '0'){ - printk(KERN_ERR "ubd_setup : index out of range\n"); } - - if((n >= '0') && (n <= '9')) n -= '0'; - else if((n >= 'a') && (n <= 'z')) n -= 'a'; - else { - printk(KERN_ERR "ubd_setup : device syntax invalid\n"); + n = parse_unit(&str); + if(n < 0){ + printk(KERN_ERR "ubd_setup : couldn't parse unit number " + "'%s'\n", str); return(1); } if(n >= MAX_DEV){ - printk(KERN_ERR "ubd_setup : index out of range " - "(%d devices)\n", MAX_DEV); + printk(KERN_ERR "ubd_setup : index %d out of range " + "(%d devices)\n", n, MAX_DEV); return(1); } err = 1; spin_lock(&ubd_lock); - if(ubd_dev[n].file != NULL){ + dev = &ubd_dev[n]; + if(dev->file != NULL){ printk(KERN_ERR "ubd_setup : device already configured\n"); goto out2; } @@ -276,6 +302,11 @@ flags.s = 1; str++; } + if (*str == 'd'){ + dev->no_cow = 1; + str++; + } + if(*str++ != '='){ printk(KERN_ERR "ubd_setup : Expected '='\n"); goto out2; @@ -284,14 +315,17 @@ err = 0; backing_file = strchr(str, ','); if(backing_file){ - *backing_file = '\0'; - backing_file++; + if(dev->no_cow) + printk(KERN_ERR "Can't specify both 'd' and a " + "cow file\n"); + else { + *backing_file = '\0'; + backing_file++; + } } - ubd_dev[n].file = str; - if(ubd_is_dir(ubd_dev[n].file)) - ubd_dev[n].is_dir = 1; - ubd_dev[n].cow.file = backing_file; - ubd_dev[n].boot_openflags = flags; + dev->file = str; + dev->cow.file = backing_file; + dev->boot_openflags = flags; out2: spin_unlock(&ubd_lock); return(err); @@ -321,8 +355,7 @@ static int fakehd_set = 0; static int fakehd(char *str) { - printk(KERN_INFO - "fakehd : Changing ubd name to \"hd\".\n"); + printk(KERN_INFO "fakehd : Changing ubd name to \"hd\".\n"); fakehd_set = 1; return 1; } @@ -370,7 +403,6 @@ struct request *rq = elv_next_request(ubd_queue); int n; - do_ubd = NULL; intr_count++; n = read_ubd_fs(thread_fd, &req, sizeof(req)); if(n != sizeof(req)){ @@ -379,6 +411,7 @@ spin_lock(&ubd_io_lock); end_request(rq, 0); spin_unlock(&ubd_io_lock); + clear_bit(1, &ubd_servicing); return; } @@ -387,13 +420,15 @@ panic("I/O op mismatch"); ubd_finish(rq, req.error); + clear_bit(1, &ubd_servicing); reactivate_fd(thread_fd, UBD_IRQ); do_ubd_request(ubd_queue); } -static void ubd_intr(int irq, void *dev, struct pt_regs *unused) +static irqreturn_t ubd_intr(int irq, void *dev, struct pt_regs *unused) { ubd_handler(); + return(IRQ_HANDLED); } /* Only changed by ubd_init, which is an initcall. */ @@ -429,16 +464,18 @@ static int ubd_open_dev(struct ubd *dev) { struct openflags flags; - int err, n, create_cow, *create_ptr; + char **back_ptr; + int err, create_cow, *create_ptr; + dev->openflags = dev->boot_openflags; create_cow = 0; create_ptr = (dev->cow.file != NULL) ? &create_cow : NULL; - dev->fd = open_ubd_file(dev->file, &dev->openflags, &dev->cow.file, + back_ptr = dev->no_cow ? 
NULL : &dev->cow.file; + dev->fd = open_ubd_file(dev->file, &dev->openflags, back_ptr, &dev->cow.bitmap_offset, &dev->cow.bitmap_len, &dev->cow.data_offset, create_ptr); if((dev->fd == -ENOENT) && create_cow){ - n = dev - ubd_dev; dev->fd = create_cow_file(dev->file, dev->cow.file, dev->openflags, 1 << 9, &dev->cow.bitmap_offset, @@ -455,7 +492,10 @@ if(dev->cow.file != NULL){ err = -ENOMEM; dev->cow.bitmap = (void *) vmalloc(dev->cow.bitmap_len); - if(dev->cow.bitmap == NULL) goto error; + if(dev->cow.bitmap == NULL){ + printk(KERN_ERR "Failed to vmalloc COW bitmap\n"); + goto error; + } flush_tlb_kernel_vm(); err = read_cow_bitmap(dev->fd, dev->cow.bitmap, @@ -481,17 +521,31 @@ { struct gendisk *disk; + char from[sizeof("ubd/nnnnn\0")], to[sizeof("discnnnnn/disc\0")]; + int err; disk = alloc_disk(1 << UBD_SHIFT); - if (!disk) - return -ENOMEM; + if(disk == NULL) + return(-ENOMEM); disk->major = major; disk->first_minor = unit << UBD_SHIFT; disk->fops = &ubd_blops; set_capacity(disk, size / 512); - sprintf(disk->disk_name, "ubd"); - sprintf(disk->devfs_name, "ubd/disc%d", unit); + if(major == MAJOR_NR){ + sprintf(disk->disk_name, "ubd%d", unit); + sprintf(disk->devfs_name, "ubd/disc%d", unit); + sprintf(from, "ubd/%d", unit); + sprintf(to, "disc%d/disc", unit); + err = devfs_mk_symlink(from, to); + if(err) + printk("ubd_new_disk failed to make link from %s to " + "%s, error = %d\n", from, to, err); + } + else { + sprintf(disk->disk_name, "ubd_fake%d", unit); + sprintf(disk->devfs_name, "ubd_fake/disc%d", unit); + } disk->private_data = &ubd_dev[unit]; disk->queue = ubd_queue; @@ -506,10 +560,7 @@ struct ubd *dev = &ubd_dev[n]; int err; - if(dev->is_dir) - return(-EISDIR); - - if (!dev->file) + if(dev->file == NULL) return(-ENODEV); if (ubd_open_dev(dev)) @@ -523,7 +574,7 @@ if(err) return(err); - if(fake_major) + if(fake_major != MAJOR_NR) ubd_new_disk(fake_major, dev->size, n, &fake_gendisk[n]); @@ -561,42 +612,42 @@ return(err); } -static int ubd_get_config(char *dev, char *str, int size, char **error_out) +static int ubd_get_config(char *name, char *str, int size, char **error_out) { - struct ubd *ubd; + struct ubd *dev; char *end; - int major, n = 0; + int n, len = 0; - major = simple_strtoul(dev, &end, 0); - if((*end != '\0') || (end == dev)){ - *error_out = "ubd_get_config : didn't parse major number"; + n = simple_strtoul(name, &end, 0); + if((*end != '\0') || (end == name)){ + *error_out = "ubd_get_config : didn't parse device number"; return(-1); } - if((major >= MAX_DEV) || (major < 0)){ - *error_out = "ubd_get_config : major number out of range"; + if((n >= MAX_DEV) || (n < 0)){ + *error_out = "ubd_get_config : device number out of range"; return(-1); } - ubd = &ubd_dev[major]; + dev = &ubd_dev[n]; spin_lock(&ubd_lock); - if(ubd->file == NULL){ - CONFIG_CHUNK(str, size, n, "", 1); + if(dev->file == NULL){ + CONFIG_CHUNK(str, size, len, "", 1); goto out; } - CONFIG_CHUNK(str, size, n, ubd->file, 0); + CONFIG_CHUNK(str, size, len, dev->file, 0); - if(ubd->cow.file != NULL){ - CONFIG_CHUNK(str, size, n, ",", 0); - CONFIG_CHUNK(str, size, n, ubd->cow.file, 1); + if(dev->cow.file != NULL){ + CONFIG_CHUNK(str, size, len, ",", 0); + CONFIG_CHUNK(str, size, len, dev->cow.file, 1); } - else CONFIG_CHUNK(str, size, n, "", 1); + else CONFIG_CHUNK(str, size, len, "", 1); out: spin_unlock(&ubd_lock); - return(n); + return(len); } static int ubd_remove(char *str) @@ -604,11 +655,9 @@ struct ubd *dev; int n, err = -ENODEV; - if(!isdigit(*str)) - return(err); /* it should be a number 0-7/a-h 
*/ + n = parse_unit(&str); - n = *str - '0'; - if(n >= MAX_DEV) + if((n < 0) || (n >= MAX_DEV)) return(err); dev = &ubd_dev[n]; @@ -669,7 +718,7 @@ elevator_init(ubd_queue, &elevator_noop); - if (fake_major != 0) { + if (fake_major != MAJOR_NR) { char name[sizeof("ubd_nnn\0")]; snprintf(name, sizeof(name), "ubd_%d", fake_major); @@ -714,15 +763,9 @@ { struct gendisk *disk = inode->i_bdev->bd_disk; struct ubd *dev = disk->private_data; - int err = -EISDIR; - - if(dev->is_dir == 1) - goto out; + int err = 0; - err = 0; if(dev->count == 0){ - dev->openflags = dev->boot_openflags; - err = ubd_open_dev(dev); if(err){ printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n", @@ -796,15 +839,6 @@ if(req->rq_status == RQ_INACTIVE) return(1); - if(dev->is_dir){ - strcpy(req->buffer, "HOSTFS:"); - strcat(req->buffer, dev->file); - spin_lock(&ubd_io_lock); - end_request(req, 1); - spin_unlock(&ubd_io_lock); - return(1); - } - if((rq_data_dir(req) == WRITE) && !dev->openflags.w){ printk("Write attempted on readonly ubd device %s\n", disk->disk_name); @@ -830,6 +864,27 @@ io_req->cow_offset = -1; io_req->error = 0; +//#define TRACE1 1 +#ifdef TRACE1 + if (disk->first_minor >> disk->minor_shift == TRACE1) { + static unsigned long lastaccessed=-2; + static unsigned long written=0; + char *oper; + + switch ( io_req->op ) { + case UBD_READ: oper="READ"; break; + case UBD_WRITE: oper="WRITE"; break; + default: oper="UNKNOWN"; break; + } + if ( lastaccessed + 1 != req->sector) { + printk(KERN_DEBUG "Nonsequential disk %s for sector %ld, len %d, last accessed %ld contig %ld\n", oper, req->sector, nsect, lastaccessed, written); + written=0; + } + lastaccessed=req->sector+nsect-1; + written+=nsect; + } +#endif + if(dev->cow.file != NULL) cowify_req(io_req, dev); return(0); } @@ -851,16 +906,21 @@ } } else { - if(do_ubd || list_empty(&q->queue_head)) return; + /* if there is no requests or if another thread already + already started async io - return */ + if(list_empty(&q->queue_head) || + test_and_set_bit(1, &ubd_servicing)) return; + req = elv_next_request(q); err = prepare_request(req, &io_req); if(!err){ - do_ubd = ubd_handler; n = write_ubd_fs(thread_fd, (char *) &io_req, sizeof(io_req)); if(n != sizeof(io_req)) printk("write to io thread failed, " "errno = %d\n", -n); + } else { + clear_bit(1, &ubd_servicing); } } } diff -Nru a/arch/um/drivers/ubd_kern.c~uml-kill-irq_kern.h.diff b/arch/um/drivers/ubd_kern.c~uml-kill-irq_kern.h.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/drivers/ubd_kern.c~uml-kill-irq_kern.h.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,1008 @@ +/* + * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +/* 2001-09-28...2002-04-17 + * Partition stuff by James_McMechan@hotmail.com + * old style ubd by setting UBD_SHIFT to 0 + * 2002-09-27...2002-10-18 massive tinkering for 2.5 + * partitions have changed in 2.5 + * 2003-01-29 more tinkering for 2.5.59-1 + * This should now address the sysfs problems and has + * the symlink for devfs to allow for booting with + * the common /dev/ubd/discX/... names rather than + * only /dev/ubdN/discN this version also has lots of + * clean ups preparing for ubd-many. 
+ * James McMechan + */ + +#define MAJOR_NR UBD_MAJOR +#define UBD_SHIFT 0 + +#include "linux/config.h" +#include "linux/module.h" +#include "linux/blkdev.h" +#include "linux/hdreg.h" +#include "linux/interrupt.h" +#include "linux/init.h" +#include "linux/devfs_fs_kernel.h" +#include "linux/cdrom.h" +#include "linux/proc_fs.h" +#include "linux/ctype.h" +#include "linux/capability.h" +#include "linux/mm.h" +#include "linux/vmalloc.h" +#include "linux/blkpg.h" +#include "linux/genhd.h" +#include "linux/spinlock.h" +#include "linux/bitops.h" +#include "asm/segment.h" +#include "asm/uaccess.h" +#include "asm/irq.h" +#include "asm/types.h" +#include "asm/tlbflush.h" +#include "user_util.h" +#include "mem_user.h" +#include "kern_util.h" +#include "kern.h" +#include "mconsole_kern.h" +#include "init.h" +#include "irq_user.h" +#include "irq_kern.h" +#include "ubd_user.h" +#include "2_5compat.h" +#include "os.h" + +static spinlock_t ubd_io_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t ubd_lock = SPIN_LOCK_UNLOCKED; + +/* We set this when we asked io thread to do some work, + by using this flag we can avoid do_ubd_request to schedule + io more then once for any given request. (race seen on SMP) */ +static long ubd_servicing; + +static int ubd_open(struct inode * inode, struct file * filp); +static int ubd_release(struct inode * inode, struct file * file); +static int ubd_ioctl(struct inode * inode, struct file * file, + unsigned int cmd, unsigned long arg); + +#define MAX_DEV (8) + +static struct block_device_operations ubd_blops = { + .owner = THIS_MODULE, + .open = ubd_open, + .release = ubd_release, + .ioctl = ubd_ioctl, +}; + +/* Protected by the queue_lock */ +static request_queue_t *ubd_queue; + +/* Protected by ubd_lock */ +static int fake_major = MAJOR_NR; + +static struct gendisk *ubd_gendisk[MAX_DEV]; +static struct gendisk *fake_gendisk[MAX_DEV]; + +#ifdef CONFIG_BLK_DEV_UBD_SYNC +#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \ + .cl = 1 }) +#else +#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 0, .c = 0, \ + .cl = 1 }) +#endif + +/* Not protected - changed only in ubd_setup_common and then only to + * to enable O_SYNC. + */ +static struct openflags global_openflags = OPEN_FLAGS; + +struct cow { + char *file; + int fd; + unsigned long *bitmap; + unsigned long bitmap_len; + int bitmap_offset; + int data_offset; +}; + +struct ubd { + char *file; + int count; + int fd; + __u64 size; + struct openflags boot_openflags; + struct openflags openflags; + int no_cow; + struct cow cow; +}; + +#define DEFAULT_COW { \ + .file = NULL, \ + .fd = -1, \ + .bitmap = NULL, \ + .bitmap_offset = 0, \ + .data_offset = 0, \ +} + +#define DEFAULT_UBD { \ + .file = NULL, \ + .count = 0, \ + .fd = -1, \ + .size = -1, \ + .boot_openflags = OPEN_FLAGS, \ + .openflags = OPEN_FLAGS, \ + .no_cow = 0, \ + .cow = DEFAULT_COW, \ +} + +struct ubd ubd_dev[MAX_DEV] = { [ 0 ... 
MAX_DEV - 1 ] = DEFAULT_UBD }; + +static int ubd0_init(void) +{ + struct ubd *dev = &ubd_dev[0]; + + if(dev->file == NULL) + dev->file = "root_fs"; + return(0); +} + +__initcall(ubd0_init); + +/* Only changed by fake_ide_setup which is a setup */ +static int fake_ide = 0; +static struct proc_dir_entry *proc_ide_root = NULL; +static struct proc_dir_entry *proc_ide = NULL; + +static void make_proc_ide(void) +{ + proc_ide_root = proc_mkdir("ide", 0); + proc_ide = proc_mkdir("ide0", proc_ide_root); +} + +static int proc_ide_read_media(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len; + + strcpy(page, "disk\n"); + len = strlen("disk\n"); + len -= off; + if (len < count){ + *eof = 1; + if (len <= 0) return 0; + } + else len = count; + *start = page + off; + return len; +} + +static void make_ide_entries(char *dev_name) +{ + struct proc_dir_entry *dir, *ent; + char name[64]; + + if(proc_ide_root == NULL) make_proc_ide(); + + dir = proc_mkdir(dev_name, proc_ide); + if(!dir) return; + + ent = create_proc_entry("media", S_IFREG|S_IRUGO, dir); + if(!ent) return; + ent->nlink = 1; + ent->data = NULL; + ent->read_proc = proc_ide_read_media; + ent->write_proc = NULL; + sprintf(name,"ide0/%s", dev_name); + proc_symlink(dev_name, proc_ide_root, name); +} + +static int fake_ide_setup(char *str) +{ + fake_ide = 1; + return(1); +} + +__setup("fake_ide", fake_ide_setup); + +__uml_help(fake_ide_setup, +"fake_ide\n" +" Create ide0 entries that map onto ubd devices.\n\n" +); + +static int parse_unit(char **ptr) +{ + char *str = *ptr, *end; + int n = -1; + + if(isdigit(*str)) { + n = simple_strtoul(str, &end, 0); + if(end == str) + return(-1); + *ptr = end; + } + else if (('a' <= *str) && (*str <= 'h')) { + n = *str - 'a'; + str++; + *ptr = str; + } + return(n); +} + +static int ubd_setup_common(char *str, int *index_out) +{ + struct ubd *dev; + struct openflags flags = global_openflags; + char *backing_file; + int n, err; + + if(index_out) *index_out = -1; + n = *str; + if(n == '='){ + char *end; + int major; + + str++; + if(!strcmp(str, "sync")){ + global_openflags.s = 1; + return(0); + } + major = simple_strtoul(str, &end, 0); + if((*end != '\0') || (end == str)){ + printk(KERN_ERR + "ubd_setup : didn't parse major number\n"); + return(1); + } + + err = 1; + spin_lock(&ubd_lock); + if(fake_major != MAJOR_NR){ + printk(KERN_ERR "Can't assign a fake major twice\n"); + goto out1; + } + + fake_major = major; + + printk(KERN_INFO "Setting extra ubd major number to %d\n", + major); + err = 0; + out1: + spin_unlock(&ubd_lock); + return(err); + } + + n = parse_unit(&str); + if(n < 0){ + printk(KERN_ERR "ubd_setup : couldn't parse unit number " + "'%s'\n", str); + return(1); + } + if(n >= MAX_DEV){ + printk(KERN_ERR "ubd_setup : index %d out of range " + "(%d devices)\n", n, MAX_DEV); + return(1); + } + + err = 1; + spin_lock(&ubd_lock); + + dev = &ubd_dev[n]; + if(dev->file != NULL){ + printk(KERN_ERR "ubd_setup : device already configured\n"); + goto out2; + } + + if(index_out) *index_out = n; + + if (*str == 'r'){ + flags.w = 0; + str++; + } + if (*str == 's'){ + flags.s = 1; + str++; + } + if (*str == 'd'){ + dev->no_cow = 1; + str++; + } + + if(*str++ != '='){ + printk(KERN_ERR "ubd_setup : Expected '='\n"); + goto out2; + } + + err = 0; + backing_file = strchr(str, ','); + if(backing_file){ + if(dev->no_cow) + printk(KERN_ERR "Can't specify both 'd' and a " + "cow file\n"); + else { + *backing_file = '\0'; + backing_file++; + } + } + dev->file = str; + dev->cow.file = 
backing_file; + dev->boot_openflags = flags; + out2: + spin_unlock(&ubd_lock); + return(err); +} + +static int ubd_setup(char *str) +{ + ubd_setup_common(str, NULL); + return(1); +} + +__setup("ubd", ubd_setup); +__uml_help(ubd_setup, +"ubd=\n" +" This is used to associate a device with a file in the underlying\n" +" filesystem. Usually, there is a filesystem in the file, but \n" +" that's not required. Swap devices containing swap files can be\n" +" specified like this. Also, a file which doesn't contain a\n" +" filesystem can have its contents read in the virtual \n" +" machine by running dd on the device. n must be in the range\n" +" 0 to 7. Appending an 'r' to the number will cause that device\n" +" to be mounted read-only. For example ubd1r=./ext_fs. Appending\n" +" an 's' (has to be _after_ 'r', if there is one) will cause data\n" +" to be written to disk on the host immediately.\n\n" +); + +static int fakehd_set = 0; +static int fakehd(char *str) +{ + printk(KERN_INFO "fakehd : Changing ubd name to \"hd\".\n"); + fakehd_set = 1; + return 1; +} + +__setup("fakehd", fakehd); +__uml_help(fakehd, +"fakehd\n" +" Change the ubd device name to \"hd\".\n\n" +); + +static void do_ubd_request(request_queue_t * q); + +/* Only changed by ubd_init, which is an initcall. */ +int thread_fd = -1; + +/* Changed by ubd_handler, which is serialized because interrupts only + * happen on CPU 0. + */ +int intr_count = 0; + +static void ubd_finish(struct request *req, int error) +{ + int nsect; + + if(error){ + spin_lock(&ubd_io_lock); + end_request(req, 0); + spin_unlock(&ubd_io_lock); + return; + } + nsect = req->current_nr_sectors; + req->sector += nsect; + req->buffer += nsect << 9; + req->errors = 0; + req->nr_sectors -= nsect; + req->current_nr_sectors = 0; + spin_lock(&ubd_io_lock); + end_request(req, 1); + spin_unlock(&ubd_io_lock); +} + +static void ubd_handler(void) +{ + struct io_thread_req req; + struct request *rq = elv_next_request(ubd_queue); + int n; + + intr_count++; + n = read_ubd_fs(thread_fd, &req, sizeof(req)); + if(n != sizeof(req)){ + printk(KERN_ERR "Pid %d - spurious interrupt in ubd_handler, " + "errno = %d\n", os_getpid(), -n); + spin_lock(&ubd_io_lock); + end_request(rq, 0); + spin_unlock(&ubd_io_lock); + clear_bit(1, &ubd_servicing); + return; + } + + if((req.offset != ((__u64) (rq->sector)) << 9) || + (req.length != (rq->current_nr_sectors) << 9)) + panic("I/O op mismatch"); + + ubd_finish(rq, req.error); + clear_bit(1, &ubd_servicing); + reactivate_fd(thread_fd, UBD_IRQ); + do_ubd_request(ubd_queue); +} + +static irqreturn_t ubd_intr(int irq, void *dev, struct pt_regs *unused) +{ + ubd_handler(); + return(IRQ_HANDLED); +} + +/* Only changed by ubd_init, which is an initcall. */ +static int io_pid = -1; + +void kill_io_thread(void) +{ + if(io_pid != -1) + os_kill_process(io_pid, 1); +} + +__uml_exitcall(kill_io_thread); + +static int ubd_file_size(struct ubd *dev, __u64 *size_out) +{ + char *file; + + file = dev->cow.file ? dev->cow.file : dev->file; + return(os_file_size(file, size_out)); +} + +static void ubd_close(struct ubd *dev) +{ + os_close_file(dev->fd); + if(dev->cow.file == NULL) + return; + + os_close_file(dev->cow.fd); + vfree(dev->cow.bitmap); + dev->cow.bitmap = NULL; +} + +static int ubd_open_dev(struct ubd *dev) +{ + struct openflags flags; + char **back_ptr; + int err, create_cow, *create_ptr; + + dev->openflags = dev->boot_openflags; + create_cow = 0; + create_ptr = (dev->cow.file != NULL) ? &create_cow : NULL; + back_ptr = dev->no_cow ? 
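+	/* When the 'd' flag was given, no_cow is set and back_ptr stays
+	 * NULL, so open_ubd_file() skips the COW header probe
+	 * entirely. */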
NULL : &dev->cow.file; + dev->fd = open_ubd_file(dev->file, &dev->openflags, back_ptr, + &dev->cow.bitmap_offset, &dev->cow.bitmap_len, + &dev->cow.data_offset, create_ptr); + + if((dev->fd == -ENOENT) && create_cow){ + dev->fd = create_cow_file(dev->file, dev->cow.file, + dev->openflags, 1 << 9, + &dev->cow.bitmap_offset, + &dev->cow.bitmap_len, + &dev->cow.data_offset); + if(dev->fd >= 0){ + printk(KERN_INFO "Creating \"%s\" as COW file for " + "\"%s\"\n", dev->file, dev->cow.file); + } + } + + if(dev->fd < 0) return(dev->fd); + + if(dev->cow.file != NULL){ + err = -ENOMEM; + dev->cow.bitmap = (void *) vmalloc(dev->cow.bitmap_len); + if(dev->cow.bitmap == NULL){ + printk(KERN_ERR "Failed to vmalloc COW bitmap\n"); + goto error; + } + flush_tlb_kernel_vm(); + + err = read_cow_bitmap(dev->fd, dev->cow.bitmap, + dev->cow.bitmap_offset, + dev->cow.bitmap_len); + if(err) goto error; + + flags = dev->openflags; + flags.w = 0; + err = open_ubd_file(dev->cow.file, &flags, NULL, NULL, NULL, + NULL, NULL); + if(err < 0) goto error; + dev->cow.fd = err; + } + return(0); + error: + os_close_file(dev->fd); + return(err); +} + +static int ubd_new_disk(int major, u64 size, int unit, + struct gendisk **disk_out) + +{ + struct gendisk *disk; + char from[sizeof("ubd/nnnnn\0")], to[sizeof("discnnnnn/disc\0")]; + int err; + + disk = alloc_disk(1 << UBD_SHIFT); + if(disk == NULL) + return(-ENOMEM); + + disk->major = major; + disk->first_minor = unit << UBD_SHIFT; + disk->fops = &ubd_blops; + set_capacity(disk, size / 512); + if(major == MAJOR_NR){ + sprintf(disk->disk_name, "ubd%d", unit); + sprintf(disk->devfs_name, "ubd/disc%d", unit); + sprintf(from, "ubd/%d", unit); + sprintf(to, "disc%d/disc", unit); + err = devfs_mk_symlink(from, to); + if(err) + printk("ubd_new_disk failed to make link from %s to " + "%s, error = %d\n", from, to, err); + } + else { + sprintf(disk->disk_name, "ubd_fake%d", unit); + sprintf(disk->devfs_name, "ubd_fake/disc%d", unit); + } + + disk->private_data = &ubd_dev[unit]; + disk->queue = ubd_queue; + add_disk(disk); + + *disk_out = disk; + return 0; +} + +static int ubd_add(int n) +{ + struct ubd *dev = &ubd_dev[n]; + int err; + + if(dev->file == NULL) + return(-ENODEV); + + if (ubd_open_dev(dev)) + return(-ENODEV); + + err = ubd_file_size(dev, &dev->size); + if(err) + return(err); + + err = ubd_new_disk(MAJOR_NR, dev->size, n, &ubd_gendisk[n]); + if(err) + return(err); + + if(fake_major != MAJOR_NR) + ubd_new_disk(fake_major, dev->size, n, + &fake_gendisk[n]); + + /* perhaps this should also be under the "if (fake_major)" above */ + /* using the fake_disk->disk_name and also the fakehd_set name */ + if (fake_ide) + make_ide_entries(ubd_gendisk[n]->disk_name); + + ubd_close(dev); + return 0; +} + +static int ubd_config(char *str) +{ + int n, err; + + str = uml_strdup(str); + if(str == NULL){ + printk(KERN_ERR "ubd_config failed to strdup string\n"); + return(1); + } + err = ubd_setup_common(str, &n); + if(err){ + kfree(str); + return(-1); + } + if(n == -1) return(0); + + spin_lock(&ubd_lock); + err = ubd_add(n); + if(err) + ubd_dev[n].file = NULL; + spin_unlock(&ubd_lock); + + return(err); +} + +static int ubd_get_config(char *name, char *str, int size, char **error_out) +{ + struct ubd *dev; + char *end; + int n, len = 0; + + n = simple_strtoul(name, &end, 0); + if((*end != '\0') || (end == name)){ + *error_out = "ubd_get_config : didn't parse device number"; + return(-1); + } + + if((n >= MAX_DEV) || (n < 0)){ + *error_out = "ubd_get_config : device number out of range"; + 
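+	/* ubd_get_config answers an mconsole config query for this
+	 * device; on success the reply is "file" or "file,cow_file",
+	 * assembled below with CONFIG_CHUNK. */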
return(-1);
+	}
+
+	dev = &ubd_dev[n];
+	spin_lock(&ubd_lock);
+
+	if(dev->file == NULL){
+		CONFIG_CHUNK(str, size, len, "", 1);
+		goto out;
+	}
+
+	CONFIG_CHUNK(str, size, len, dev->file, 0);
+
+	if(dev->cow.file != NULL){
+		CONFIG_CHUNK(str, size, len, ",", 0);
+		CONFIG_CHUNK(str, size, len, dev->cow.file, 1);
+	}
+	else CONFIG_CHUNK(str, size, len, "", 1);
+
+ out:
+	spin_unlock(&ubd_lock);
+	return(len);
+}
+
+static int ubd_remove(char *str)
+{
+	struct ubd *dev;
+	int n, err = -ENODEV;
+
+	n = parse_unit(&str);
+
+	if((n < 0) || (n >= MAX_DEV))
+		return(err);
+
+	dev = &ubd_dev[n];
+	if(dev->count > 0)
+		return(-EBUSY);	/* you cannot remove an open disk */
+
+	err = 0;
+	spin_lock(&ubd_lock);
+
+	if(ubd_gendisk[n] == NULL)
+		goto out;
+
+	del_gendisk(ubd_gendisk[n]);
+	put_disk(ubd_gendisk[n]);
+	ubd_gendisk[n] = NULL;
+
+	if(fake_gendisk[n] != NULL){
+		del_gendisk(fake_gendisk[n]);
+		put_disk(fake_gendisk[n]);
+		fake_gendisk[n] = NULL;
+	}
+
+	*dev = ((struct ubd) DEFAULT_UBD);
+	err = 0;
+ out:
+	spin_unlock(&ubd_lock);
+	return(err);
+}
+
+static struct mc_device ubd_mc = {
+	.name = "ubd",
+	.config = ubd_config,
+	.get_config = ubd_get_config,
+	.remove = ubd_remove,
+};
+
+static int ubd_mc_init(void)
+{
+	mconsole_register_dev(&ubd_mc);
+	return 0;
+}
+
+__initcall(ubd_mc_init);
+
+int ubd_init(void)
+{
+	int i;
+
+	devfs_mk_dir("ubd");
+	if (register_blkdev(MAJOR_NR, "ubd"))
+		return -1;
+
+	ubd_queue = blk_init_queue(do_ubd_request, &ubd_io_lock);
+	if (!ubd_queue) {
+		unregister_blkdev(MAJOR_NR, "ubd");
+		return -1;
+	}
+
+	elevator_init(ubd_queue, &elevator_noop);
+
+	if (fake_major != MAJOR_NR) {
+		char name[sizeof("ubd_nnn\0")];
+
+		snprintf(name, sizeof(name), "ubd_%d", fake_major);
+		devfs_mk_dir(name);
+		if (register_blkdev(fake_major, "ubd"))
+			return -1;
+	}
+	for (i = 0; i < MAX_DEV; i++)
+		ubd_add(i);
+	return 0;
+}
+
+late_initcall(ubd_init);
+
+int ubd_driver_init(void){
+	unsigned long stack;
+	int err;
+
+	if(global_openflags.s){
+		printk(KERN_INFO "ubd : Synchronous mode\n");
+		return(0);
+	}
+	stack = alloc_stack(0, 0);
+	io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *),
+				 &thread_fd);
+	if(io_pid < 0){
+		printk(KERN_ERR
+		       "ubd : Failed to start I/O thread (errno = %d) - "
+		       "falling back to synchronous I/O\n", -io_pid);
+		return(0);
+	}
+	err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr,
+			     SA_INTERRUPT, "ubd", ubd_dev);
+	if(err != 0) printk(KERN_ERR
+		"um_request_irq failed - errno = %d\n", -err);
+	return(err);
+}
+
+device_initcall(ubd_driver_init);
+
+static int ubd_open(struct inode *inode, struct file *filp)
+{
+	struct gendisk *disk = inode->i_bdev->bd_disk;
+	struct ubd *dev = disk->private_data;
+	int err = 0;
+
+	if(dev->count == 0){
+		err = ubd_open_dev(dev);
+		if(err){
+			printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n",
+			       disk->disk_name, dev->file, -err);
+			goto out;
+		}
+	}
+	dev->count++;
+	if((filp->f_mode & FMODE_WRITE) && !dev->openflags.w){
+		if(--dev->count == 0) ubd_close(dev);
+		err = -EROFS;
+	}
+ out:
+	return(err);
+}
+
+static int ubd_release(struct inode * inode, struct file * file)
+{
+	struct gendisk *disk = inode->i_bdev->bd_disk;
+	struct ubd *dev = disk->private_data;
+
+	if(--dev->count == 0)
+		ubd_close(dev);
+	return(0);
+}
+
+void cowify_req(struct io_thread_req *req, struct ubd *dev)
+{
+	int i, update_bitmap, sector = req->offset >> 9;
+
+	if(req->length > (sizeof(req->sector_mask) * 8) << 9)
+		panic("Operation too long");
+	if(req->op == UBD_READ) {
+		for(i = 0; i < req->length >> 9; i++){
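+			/* A set bit in the COW bitmap means the sector
+			 * lives in the COW file.  Bits set in
+			 * req->sector_mask make do_io() read from
+			 * fds[1] (the COW file, at data_offset); clear
+			 * bits fall back to fds[0], the read-only
+			 * backing file. */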
+			if(ubd_test_bit(sector + i, (unsigned char *)
+					dev->cow.bitmap)){
+				ubd_set_bit(i, (unsigned char *)
+					    &req->sector_mask);
+			}
+		}
+	}
+	else {
+		update_bitmap = 0;
+		for(i = 0; i < req->length >> 9; i++){
+			ubd_set_bit(i, (unsigned char *)
+				    &req->sector_mask);
+			if(!ubd_test_bit(sector + i, (unsigned char *)
+					 dev->cow.bitmap))
+				update_bitmap = 1;
+			ubd_set_bit(sector + i, (unsigned char *)
+				    dev->cow.bitmap);
+		}
+		if(update_bitmap){
+			req->cow_offset = sector / (sizeof(unsigned long) * 8);
+			req->bitmap_words[0] =
+				dev->cow.bitmap[req->cow_offset];
+			req->bitmap_words[1] =
+				dev->cow.bitmap[req->cow_offset + 1];
+			req->cow_offset *= sizeof(unsigned long);
+			req->cow_offset += dev->cow.bitmap_offset;
+		}
+	}
+}
+
+static int prepare_request(struct request *req, struct io_thread_req *io_req)
+{
+	struct gendisk *disk = req->rq_disk;
+	struct ubd *dev = disk->private_data;
+	__u64 block;
+	int nsect;
+
+	if(req->rq_status == RQ_INACTIVE) return(1);
+
+	if((rq_data_dir(req) == WRITE) && !dev->openflags.w){
+		printk("Write attempted on readonly ubd device %s\n",
+		       disk->disk_name);
+		spin_lock(&ubd_io_lock);
+		end_request(req, 0);
+		spin_unlock(&ubd_io_lock);
+		return(1);
+	}
+
+	block = req->sector;
+	nsect = req->current_nr_sectors;
+
+	io_req->op = rq_data_dir(req) == READ ? UBD_READ : UBD_WRITE;
+	io_req->fds[0] = (dev->cow.file != NULL) ? dev->cow.fd : dev->fd;
+	io_req->fds[1] = dev->fd;
+	io_req->offsets[0] = 0;
+	io_req->offsets[1] = dev->cow.data_offset;
+	io_req->offset = ((__u64) block) << 9;
+	io_req->length = nsect << 9;
+	io_req->buffer = req->buffer;
+	io_req->sectorsize = 1 << 9;
+	io_req->sector_mask = 0;
+	io_req->cow_offset = -1;
+	io_req->error = 0;
+
+//#define TRACE1 1
+#ifdef TRACE1
+	if (disk->first_minor >> disk->minor_shift == TRACE1) {
+		static unsigned long lastaccessed = -2;
+		static unsigned long written = 0;
+		char *oper;
+
+		switch (io_req->op) {
+		case UBD_READ: oper = "READ"; break;
+		case UBD_WRITE: oper = "WRITE"; break;
+		default: oper = "UNKNOWN"; break;
+		}
+		if (lastaccessed + 1 != req->sector) {
+			printk(KERN_DEBUG "Nonsequential disk %s for sector "
+			       "%ld, len %d, last accessed %ld contig %ld\n",
+			       oper, req->sector, nsect, lastaccessed,
+			       written);
+			written = 0;
+		}
+		lastaccessed = req->sector + nsect - 1;
+		written += nsect;
+	}
+#endif
+
+	if(dev->cow.file != NULL) cowify_req(io_req, dev);
+	return(0);
+}
+
+static void do_ubd_request(request_queue_t *q)
+{
+	struct io_thread_req io_req;
+	struct request *req;
+	int err, n;
+
+	if(thread_fd == -1){
+		while(!list_empty(&q->queue_head)){
+			req = elv_next_request(q);
+			err = prepare_request(req, &io_req);
+			if(!err){
+				do_io(&io_req);
+				ubd_finish(req, io_req.error);
+			}
+		}
+	}
+	else {
+		/* If there are no requests, or if another thread has
+		   already started async io, return. */
+		if(list_empty(&q->queue_head) ||
+		   test_and_set_bit(1, &ubd_servicing)) return;
+
+		req = elv_next_request(q);
+		err = prepare_request(req, &io_req);
+		if(!err){
+			n = write_ubd_fs(thread_fd, (char *) &io_req,
+					 sizeof(io_req));
+			if(n != sizeof(io_req))
+				printk("write to io thread failed, "
+				       "errno = %d\n", -n);
+		} else {
+			clear_bit(1, &ubd_servicing);
+		}
+	}
+}
+
+static int ubd_ioctl(struct inode * inode, struct file * file,
+		     unsigned int cmd, unsigned long arg)
+{
+	struct hd_geometry *loc = (struct hd_geometry *) arg;
+	struct ubd *dev = inode->i_bdev->bd_disk->private_data;
+	int err;
+	struct hd_driveid ubd_id = {
+		.cyls		= 0,
+		.heads		= 128,
+		.sectors	= 32,
+	};
+
+	switch (cmd) {
+		struct hd_geometry g;
+		struct 
cdrom_volctrl volume; + case HDIO_GETGEO: + if(!loc) return(-EINVAL); + g.heads = 128; + g.sectors = 32; + g.cylinders = dev->size / (128 * 32 * 512); + g.start = 2; + return(copy_to_user(loc, &g, sizeof(g)) ? -EFAULT : 0); + + case HDIO_SET_UNMASKINTR: + if(!capable(CAP_SYS_ADMIN)) return(-EACCES); + if((arg > 1) || (inode->i_bdev->bd_contains != inode->i_bdev)) + return(-EINVAL); + return(0); + + case HDIO_GET_UNMASKINTR: + if(!arg) return(-EINVAL); + err = verify_area(VERIFY_WRITE, (long *) arg, sizeof(long)); + if(err) + return(err); + return(0); + + case HDIO_GET_MULTCOUNT: + if(!arg) return(-EINVAL); + err = verify_area(VERIFY_WRITE, (long *) arg, sizeof(long)); + if(err) + return(err); + return(0); + + case HDIO_SET_MULTCOUNT: + if(!capable(CAP_SYS_ADMIN)) return(-EACCES); + if(inode->i_bdev->bd_contains != inode->i_bdev) + return(-EINVAL); + return(0); + + case HDIO_GET_IDENTITY: + ubd_id.cyls = dev->size / (128 * 32 * 512); + if(copy_to_user((char *) arg, (char *) &ubd_id, + sizeof(ubd_id))) + return(-EFAULT); + return(0); + + case CDROMVOLREAD: + if(copy_from_user(&volume, (char *) arg, sizeof(volume))) + return(-EFAULT); + volume.channel0 = 255; + volume.channel1 = 255; + volume.channel2 = 255; + volume.channel3 = 255; + if(copy_to_user((char *) arg, &volume, sizeof(volume))) + return(-EFAULT); + return(0); + } + return(-EINVAL); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/drivers/ubd_kern.c~uml-summa.diff b/arch/um/drivers/ubd_kern.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/drivers/ubd_kern.c~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,947 @@ +/* + * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +/* 2001-09-28...2002-04-17 + * Partition stuff by James_McMechan@hotmail.com + * old style ubd by setting UBD_SHIFT to 0 + * 2002-09-27...2002-10-18 massive tinkering for 2.5 + * partitions have changed in 2.5 + */ + +#define MAJOR_NR UBD_MAJOR +#define UBD_SHIFT 4 + +#include "linux/config.h" +#include "linux/module.h" +#include "linux/blkdev.h" +#include "linux/hdreg.h" +#include "linux/init.h" +#include "linux/devfs_fs_kernel.h" +#include "linux/cdrom.h" +#include "linux/proc_fs.h" +#include "linux/ctype.h" +#include "linux/capability.h" +#include "linux/mm.h" +#include "linux/vmalloc.h" +#include "linux/blkpg.h" +#include "linux/genhd.h" +#include "linux/spinlock.h" +#include "asm/segment.h" +#include "asm/uaccess.h" +#include "asm/irq.h" +#include "asm/types.h" +#include "asm/tlbflush.h" +#include "user_util.h" +#include "mem_user.h" +#include "kern_util.h" +#include "kern.h" +#include "mconsole_kern.h" +#include "init.h" +#include "irq_user.h" +#include "ubd_user.h" +#include "2_5compat.h" +#include "os.h" + +static spinlock_t ubd_io_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t ubd_lock = SPIN_LOCK_UNLOCKED; + +static void (*do_ubd)(void); + +static int ubd_open(struct inode * inode, struct file * filp); +static int ubd_release(struct inode * inode, struct file * file); +static int ubd_ioctl(struct inode * inode, struct file * file, + unsigned int cmd, unsigned long arg); + +#define MAX_DEV (8) + +static struct block_device_operations ubd_blops = { + .owner = 
THIS_MODULE, + .open = ubd_open, + .release = ubd_release, + .ioctl = ubd_ioctl, +}; + +/* Protected by the queue_lock */ +static request_queue_t *ubd_queue; + +/* Protected by ubd_lock */ +static int fake_major = 0; + +static struct gendisk *ubd_gendisk[MAX_DEV]; +static struct gendisk *fake_gendisk[MAX_DEV]; + +#ifdef CONFIG_BLK_DEV_UBD_SYNC +#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \ + .cl = 1 }) +#else +#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 0, .c = 0, \ + .cl = 1 }) +#endif + +/* Not protected - changed only in ubd_setup_common and then only to + * to enable O_SYNC. + */ +static struct openflags global_openflags = OPEN_FLAGS; + +struct cow { + char *file; + int fd; + unsigned long *bitmap; + unsigned long bitmap_len; + int bitmap_offset; + int data_offset; +}; + +struct ubd { + char *file; + int is_dir; + int count; + int fd; + __u64 size; + struct openflags boot_openflags; + struct openflags openflags; + struct cow cow; +}; + +#define DEFAULT_COW { \ + .file = NULL, \ + .fd = -1, \ + .bitmap = NULL, \ + .bitmap_offset = 0, \ + .data_offset = 0, \ +} + +#define DEFAULT_UBD { \ + .file = NULL, \ + .is_dir = 0, \ + .count = 0, \ + .fd = -1, \ + .size = -1, \ + .boot_openflags = OPEN_FLAGS, \ + .openflags = OPEN_FLAGS, \ + .cow = DEFAULT_COW, \ +} + +struct ubd ubd_dev[MAX_DEV] = { [ 0 ... MAX_DEV - 1 ] = DEFAULT_UBD }; + +static int ubd0_init(void) +{ + if(ubd_dev[0].file == NULL) + ubd_dev[0].file = "root_fs"; + return(0); +} + +__initcall(ubd0_init); + +/* Only changed by fake_ide_setup which is a setup */ +static int fake_ide = 0; +static struct proc_dir_entry *proc_ide_root = NULL; +static struct proc_dir_entry *proc_ide = NULL; + +static void make_proc_ide(void) +{ + proc_ide_root = proc_mkdir("ide", 0); + proc_ide = proc_mkdir("ide0", proc_ide_root); +} + +static int proc_ide_read_media(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int len; + + strcpy(page, "disk\n"); + len = strlen("disk\n"); + len -= off; + if (len < count){ + *eof = 1; + if (len <= 0) return 0; + } + else len = count; + *start = page + off; + return len; +} + +static void make_ide_entries(char *dev_name) +{ + struct proc_dir_entry *dir, *ent; + char name[64]; + + if(proc_ide_root == NULL) make_proc_ide(); + + dir = proc_mkdir(dev_name, proc_ide); + if(!dir) return; + + ent = create_proc_entry("media", S_IFREG|S_IRUGO, dir); + if(!ent) return; + ent->nlink = 1; + ent->data = NULL; + ent->read_proc = proc_ide_read_media; + ent->write_proc = NULL; + sprintf(name,"ide0/%s", dev_name); + proc_symlink(dev_name, proc_ide_root, name); +} + +static int fake_ide_setup(char *str) +{ + fake_ide = 1; + return(1); +} + +__setup("fake_ide", fake_ide_setup); + +__uml_help(fake_ide_setup, +"fake_ide\n" +" Create ide0 entries that map onto ubd devices.\n\n" +); + +static int ubd_setup_common(char *str, int *index_out) +{ + struct openflags flags = global_openflags; + char *backing_file; + int n, err; + + if(index_out) *index_out = -1; + n = *str++; + if(n == '='){ + static int fake_major_allowed = 1; + char *end; + int major; + + if(!strcmp(str, "sync")){ + global_openflags.s = 1; + return(0); + } + major = simple_strtoul(str, &end, 0); + if((*end != '\0') || (end == str)){ + printk(KERN_ERR + "ubd_setup : didn't parse major number\n"); + return(1); + } + + if(!fake_major_allowed){ + printk(KERN_ERR "Can't assign a fake major twice\n"); + return(1); + } + + err = 1; + spin_lock(&ubd_lock); + if(!fake_major_allowed){ + printk(KERN_ERR "Can't 
assign a fake major twice\n"); + goto out1; + } + + fake_major = major; + fake_major_allowed = 0; + + printk(KERN_INFO "Setting extra ubd major number to %d\n", + major); + err = 0; + out1: + spin_unlock(&ubd_lock); + return(err); + } + + if(n < '0'){ + printk(KERN_ERR "ubd_setup : index out of range\n"); } + + if((n >= '0') && (n <= '9')) n -= '0'; + else if((n >= 'a') && (n <= 'z')) n -= 'a'; + else { + printk(KERN_ERR "ubd_setup : device syntax invalid\n"); + return(1); + } + if(n >= MAX_DEV){ + printk(KERN_ERR "ubd_setup : index out of range " + "(%d devices)\n", MAX_DEV); + return(1); + } + + err = 1; + spin_lock(&ubd_lock); + + if(ubd_dev[n].file != NULL){ + printk(KERN_ERR "ubd_setup : device already configured\n"); + goto out2; + } + + if(index_out) *index_out = n; + + if (*str == 'r'){ + flags.w = 0; + str++; + } + if (*str == 's'){ + flags.s = 1; + str++; + } + if(*str++ != '='){ + printk(KERN_ERR "ubd_setup : Expected '='\n"); + goto out2; + } + + err = 0; + backing_file = strchr(str, ','); + if(backing_file){ + *backing_file = '\0'; + backing_file++; + } + ubd_dev[n].file = str; + if(ubd_is_dir(ubd_dev[n].file)) + ubd_dev[n].is_dir = 1; + ubd_dev[n].cow.file = backing_file; + ubd_dev[n].boot_openflags = flags; + out2: + spin_unlock(&ubd_lock); + return(err); +} + +static int ubd_setup(char *str) +{ + ubd_setup_common(str, NULL); + return(1); +} + +__setup("ubd", ubd_setup); +__uml_help(ubd_setup, +"ubd=\n" +" This is used to associate a device with a file in the underlying\n" +" filesystem. Usually, there is a filesystem in the file, but \n" +" that's not required. Swap devices containing swap files can be\n" +" specified like this. Also, a file which doesn't contain a\n" +" filesystem can have its contents read in the virtual \n" +" machine by running dd on the device. n must be in the range\n" +" 0 to 7. Appending an 'r' to the number will cause that device\n" +" to be mounted read-only. For example ubd1r=./ext_fs. Appending\n" +" an 's' (has to be _after_ 'r', if there is one) will cause data\n" +" to be written to disk on the host immediately.\n\n" +); + +static int fakehd_set = 0; +static int fakehd(char *str) +{ + printk(KERN_INFO + "fakehd : Changing ubd name to \"hd\".\n"); + fakehd_set = 1; + return 1; +} + +__setup("fakehd", fakehd); +__uml_help(fakehd, +"fakehd\n" +" Change the ubd device name to \"hd\".\n\n" +); + +static void do_ubd_request(request_queue_t * q); + +/* Only changed by ubd_init, which is an initcall. */ +int thread_fd = -1; + +/* Changed by ubd_handler, which is serialized because interrupts only + * happen on CPU 0. 
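+ * Since the interrupt handler is the only writer, a plain increment
+ * is enough and no atomic operation is needed here.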
+ */ +int intr_count = 0; + +static void ubd_finish(struct request *req, int error) +{ + int nsect; + + if(error){ + spin_lock(&ubd_io_lock); + end_request(req, 0); + spin_unlock(&ubd_io_lock); + return; + } + nsect = req->current_nr_sectors; + req->sector += nsect; + req->buffer += nsect << 9; + req->errors = 0; + req->nr_sectors -= nsect; + req->current_nr_sectors = 0; + spin_lock(&ubd_io_lock); + end_request(req, 1); + spin_unlock(&ubd_io_lock); +} + +static void ubd_handler(void) +{ + struct io_thread_req req; + struct request *rq = elv_next_request(ubd_queue); + int n; + + do_ubd = NULL; + intr_count++; + n = read_ubd_fs(thread_fd, &req, sizeof(req)); + if(n != sizeof(req)){ + printk(KERN_ERR "Pid %d - spurious interrupt in ubd_handler, " + "errno = %d\n", os_getpid(), -n); + spin_lock(&ubd_io_lock); + end_request(rq, 0); + spin_unlock(&ubd_io_lock); + return; + } + + if((req.offset != ((__u64) (rq->sector)) << 9) || + (req.length != (rq->current_nr_sectors) << 9)) + panic("I/O op mismatch"); + + ubd_finish(rq, req.error); + reactivate_fd(thread_fd, UBD_IRQ); + do_ubd_request(ubd_queue); +} + +static void ubd_intr(int irq, void *dev, struct pt_regs *unused) +{ + ubd_handler(); +} + +/* Only changed by ubd_init, which is an initcall. */ +static int io_pid = -1; + +void kill_io_thread(void) +{ + if(io_pid != -1) + os_kill_process(io_pid, 1); +} + +__uml_exitcall(kill_io_thread); + +static int ubd_file_size(struct ubd *dev, __u64 *size_out) +{ + char *file; + + file = dev->cow.file ? dev->cow.file : dev->file; + return(os_file_size(file, size_out)); +} + +static void ubd_close(struct ubd *dev) +{ + os_close_file(dev->fd); + if(dev->cow.file == NULL) + return; + + os_close_file(dev->cow.fd); + vfree(dev->cow.bitmap); + dev->cow.bitmap = NULL; +} + +static int ubd_open_dev(struct ubd *dev) +{ + struct openflags flags; + int err, n, create_cow, *create_ptr; + + create_cow = 0; + create_ptr = (dev->cow.file != NULL) ? 
&create_cow : NULL; + dev->fd = open_ubd_file(dev->file, &dev->openflags, &dev->cow.file, + &dev->cow.bitmap_offset, &dev->cow.bitmap_len, + &dev->cow.data_offset, create_ptr); + + if((dev->fd == -ENOENT) && create_cow){ + n = dev - ubd_dev; + dev->fd = create_cow_file(dev->file, dev->cow.file, + dev->openflags, 1 << 9, + &dev->cow.bitmap_offset, + &dev->cow.bitmap_len, + &dev->cow.data_offset); + if(dev->fd >= 0){ + printk(KERN_INFO "Creating \"%s\" as COW file for " + "\"%s\"\n", dev->file, dev->cow.file); + } + } + + if(dev->fd < 0) return(dev->fd); + + if(dev->cow.file != NULL){ + err = -ENOMEM; + dev->cow.bitmap = (void *) vmalloc(dev->cow.bitmap_len); + if(dev->cow.bitmap == NULL) goto error; + flush_tlb_kernel_vm(); + + err = read_cow_bitmap(dev->fd, dev->cow.bitmap, + dev->cow.bitmap_offset, + dev->cow.bitmap_len); + if(err) goto error; + + flags = dev->openflags; + flags.w = 0; + err = open_ubd_file(dev->cow.file, &flags, NULL, NULL, NULL, + NULL, NULL); + if(err < 0) goto error; + dev->cow.fd = err; + } + return(0); + error: + os_close_file(dev->fd); + return(err); +} + +static int ubd_new_disk(int major, u64 size, int unit, + struct gendisk **disk_out) + +{ + struct gendisk *disk; + + disk = alloc_disk(1 << UBD_SHIFT); + if (!disk) + return -ENOMEM; + + disk->major = major; + disk->first_minor = unit << UBD_SHIFT; + disk->fops = &ubd_blops; + set_capacity(disk, size / 512); + sprintf(disk->disk_name, "ubd"); + sprintf(disk->devfs_name, "ubd/disc%d", unit); + + disk->private_data = &ubd_dev[unit]; + disk->queue = ubd_queue; + add_disk(disk); + + *disk_out = disk; + return 0; +} + +static int ubd_add(int n) +{ + struct ubd *dev = &ubd_dev[n]; + int err; + + if(dev->is_dir) + return(-EISDIR); + + if (!dev->file) + return(-ENODEV); + + if (ubd_open_dev(dev)) + return(-ENODEV); + + err = ubd_file_size(dev, &dev->size); + if(err) + return(err); + + err = ubd_new_disk(MAJOR_NR, dev->size, n, &ubd_gendisk[n]); + if(err) + return(err); + + if(fake_major) + ubd_new_disk(fake_major, dev->size, n, + &fake_gendisk[n]); + + /* perhaps this should also be under the "if (fake_major)" above */ + /* using the fake_disk->disk_name and also the fakehd_set name */ + if (fake_ide) + make_ide_entries(ubd_gendisk[n]->disk_name); + + ubd_close(dev); + return 0; +} + +static int ubd_config(char *str) +{ + int n, err; + + str = uml_strdup(str); + if(str == NULL){ + printk(KERN_ERR "ubd_config failed to strdup string\n"); + return(1); + } + err = ubd_setup_common(str, &n); + if(err){ + kfree(str); + return(-1); + } + if(n == -1) return(0); + + spin_lock(&ubd_lock); + err = ubd_add(n); + if(err) + ubd_dev[n].file = NULL; + spin_unlock(&ubd_lock); + + return(err); +} + +static int ubd_get_config(char *dev, char *str, int size, char **error_out) +{ + struct ubd *ubd; + char *end; + int major, n = 0; + + major = simple_strtoul(dev, &end, 0); + if((*end != '\0') || (end == dev)){ + *error_out = "ubd_get_config : didn't parse major number"; + return(-1); + } + + if((major >= MAX_DEV) || (major < 0)){ + *error_out = "ubd_get_config : major number out of range"; + return(-1); + } + + ubd = &ubd_dev[major]; + spin_lock(&ubd_lock); + + if(ubd->file == NULL){ + CONFIG_CHUNK(str, size, n, "", 1); + goto out; + } + + CONFIG_CHUNK(str, size, n, ubd->file, 0); + + if(ubd->cow.file != NULL){ + CONFIG_CHUNK(str, size, n, ",", 0); + CONFIG_CHUNK(str, size, n, ubd->cow.file, 1); + } + else CONFIG_CHUNK(str, size, n, "", 1); + + out: + spin_unlock(&ubd_lock); + return(n); +} + +static int ubd_remove(char *str) +{ + struct ubd 
*dev; + int n, err = -ENODEV; + + if(!isdigit(*str)) + return(err); /* it should be a number 0-7/a-h */ + + n = *str - '0'; + if(n >= MAX_DEV) + return(err); + + dev = &ubd_dev[n]; + if(dev->count > 0) + return(-EBUSY); /* you cannot remove a open disk */ + + err = 0; + spin_lock(&ubd_lock); + + if(ubd_gendisk[n] == NULL) + goto out; + + del_gendisk(ubd_gendisk[n]); + put_disk(ubd_gendisk[n]); + ubd_gendisk[n] = NULL; + + if(fake_gendisk[n] != NULL){ + del_gendisk(fake_gendisk[n]); + put_disk(fake_gendisk[n]); + fake_gendisk[n] = NULL; + } + + *dev = ((struct ubd) DEFAULT_UBD); + err = 0; + out: + spin_unlock(&ubd_lock); + return(err); +} + +static struct mc_device ubd_mc = { + .name = "ubd", + .config = ubd_config, + .get_config = ubd_get_config, + .remove = ubd_remove, +}; + +static int ubd_mc_init(void) +{ + mconsole_register_dev(&ubd_mc); + return 0; +} + +__initcall(ubd_mc_init); + +int ubd_init(void) +{ + int i; + + devfs_mk_dir("ubd"); + if (register_blkdev(MAJOR_NR, "ubd")) + return -1; + + ubd_queue = blk_init_queue(do_ubd_request, &ubd_io_lock); + if (!ubd_queue) { + unregister_blkdev(MAJOR_NR, "ubd"); + return -1; + } + + elevator_init(ubd_queue, &elevator_noop); + + if (fake_major != 0) { + char name[sizeof("ubd_nnn\0")]; + + snprintf(name, sizeof(name), "ubd_%d", fake_major); + devfs_mk_dir(name); + if (register_blkdev(fake_major, "ubd")) + return -1; + } + for (i = 0; i < MAX_DEV; i++) + ubd_add(i); + return 0; +} + +late_initcall(ubd_init); + +int ubd_driver_init(void){ + unsigned long stack; + int err; + + if(global_openflags.s){ + printk(KERN_INFO "ubd : Synchronous mode\n"); + return(0); + } + stack = alloc_stack(0, 0); + io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *), + &thread_fd); + if(io_pid < 0){ + printk(KERN_ERR + "ubd : Failed to start I/O thread (errno = %d) - " + "falling back to synchronous I/O\n", -io_pid); + return(0); + } + err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr, + SA_INTERRUPT, "ubd", ubd_dev); + if(err != 0) printk(KERN_ERR + "um_request_irq failed - errno = %d\n", -err); + return(err); +} + +device_initcall(ubd_driver_init); + +static int ubd_open(struct inode *inode, struct file *filp) +{ + struct gendisk *disk = inode->i_bdev->bd_disk; + struct ubd *dev = disk->private_data; + int err = -EISDIR; + + if(dev->is_dir == 1) + goto out; + + err = 0; + if(dev->count == 0){ + dev->openflags = dev->boot_openflags; + + err = ubd_open_dev(dev); + if(err){ + printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n", + disk->disk_name, dev->file, -err); + goto out; + } + } + dev->count++; + if((filp->f_mode & FMODE_WRITE) && !dev->openflags.w){ + if(--dev->count == 0) ubd_close(dev); + err = -EROFS; + } + out: + return(err); +} + +static int ubd_release(struct inode * inode, struct file * file) +{ + struct gendisk *disk = inode->i_bdev->bd_disk; + struct ubd *dev = disk->private_data; + + if(--dev->count == 0) + ubd_close(dev); + return(0); +} + +void cowify_req(struct io_thread_req *req, struct ubd *dev) +{ + int i, update_bitmap, sector = req->offset >> 9; + + if(req->length > (sizeof(req->sector_mask) * 8) << 9) + panic("Operation too long"); + if(req->op == UBD_READ) { + for(i = 0; i < req->length >> 9; i++){ + if(ubd_test_bit(sector + i, (unsigned char *) + dev->cow.bitmap)){ + ubd_set_bit(i, (unsigned char *) + &req->sector_mask); + } + } + } + else { + update_bitmap = 0; + for(i = 0; i < req->length >> 9; i++){ + ubd_set_bit(i, (unsigned char *) + &req->sector_mask); + if(!ubd_test_bit(sector + i, (unsigned char *) + 
dev->cow.bitmap)) + update_bitmap = 1; + ubd_set_bit(sector + i, (unsigned char *) + dev->cow.bitmap); + } + if(update_bitmap){ + req->cow_offset = sector / (sizeof(unsigned long) * 8); + req->bitmap_words[0] = + dev->cow.bitmap[req->cow_offset]; + req->bitmap_words[1] = + dev->cow.bitmap[req->cow_offset + 1]; + req->cow_offset *= sizeof(unsigned long); + req->cow_offset += dev->cow.bitmap_offset; + } + } +} + +static int prepare_request(struct request *req, struct io_thread_req *io_req) +{ + struct gendisk *disk = req->rq_disk; + struct ubd *dev = disk->private_data; + __u64 block; + int nsect; + + if(req->rq_status == RQ_INACTIVE) return(1); + + if(dev->is_dir){ + strcpy(req->buffer, "HOSTFS:"); + strcat(req->buffer, dev->file); + spin_lock(&ubd_io_lock); + end_request(req, 1); + spin_unlock(&ubd_io_lock); + return(1); + } + + if((rq_data_dir(req) == WRITE) && !dev->openflags.w){ + printk("Write attempted on readonly ubd device %s\n", + disk->disk_name); + spin_lock(&ubd_io_lock); + end_request(req, 0); + spin_unlock(&ubd_io_lock); + return(1); + } + + block = req->sector; + nsect = req->current_nr_sectors; + + io_req->op = rq_data_dir(req) == READ ? UBD_READ : UBD_WRITE; + io_req->fds[0] = (dev->cow.file != NULL) ? dev->cow.fd : dev->fd; + io_req->fds[1] = dev->fd; + io_req->offsets[0] = 0; + io_req->offsets[1] = dev->cow.data_offset; + io_req->offset = ((__u64) block) << 9; + io_req->length = nsect << 9; + io_req->buffer = req->buffer; + io_req->sectorsize = 1 << 9; + io_req->sector_mask = 0; + io_req->cow_offset = -1; + io_req->error = 0; + + if(dev->cow.file != NULL) cowify_req(io_req, dev); + return(0); +} + +static void do_ubd_request(request_queue_t *q) +{ + struct io_thread_req io_req; + struct request *req; + int err, n; + + if(thread_fd == -1){ + while(!list_empty(&q->queue_head)){ + req = elv_next_request(q); + err = prepare_request(req, &io_req); + if(!err){ + do_io(&io_req); + ubd_finish(req, io_req.error); + } + } + } + else { + if(do_ubd || list_empty(&q->queue_head)) return; + req = elv_next_request(q); + err = prepare_request(req, &io_req); + if(!err){ + do_ubd = ubd_handler; + n = write_ubd_fs(thread_fd, (char *) &io_req, + sizeof(io_req)); + if(n != sizeof(io_req)) + printk("write to io thread failed, " + "errno = %d\n", -n); + } + } +} + +static int ubd_ioctl(struct inode * inode, struct file * file, + unsigned int cmd, unsigned long arg) +{ + struct hd_geometry *loc = (struct hd_geometry *) arg; + struct ubd *dev = inode->i_bdev->bd_disk->private_data; + int err; + struct hd_driveid ubd_id = { + .cyls = 0, + .heads = 128, + .sectors = 32, + }; + + switch (cmd) { + struct hd_geometry g; + struct cdrom_volctrl volume; + case HDIO_GETGEO: + if(!loc) return(-EINVAL); + g.heads = 128; + g.sectors = 32; + g.cylinders = dev->size / (128 * 32 * 512); + g.start = 2; + return(copy_to_user(loc, &g, sizeof(g)) ? 
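+		/* With the fake geometry above, one cylinder is
+		 * 128 * 32 * 512 = 2 MiB, so g.cylinders is simply the
+		 * device size in 2 MiB units. */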
-EFAULT : 0); + + case HDIO_SET_UNMASKINTR: + if(!capable(CAP_SYS_ADMIN)) return(-EACCES); + if((arg > 1) || (inode->i_bdev->bd_contains != inode->i_bdev)) + return(-EINVAL); + return(0); + + case HDIO_GET_UNMASKINTR: + if(!arg) return(-EINVAL); + err = verify_area(VERIFY_WRITE, (long *) arg, sizeof(long)); + if(err) + return(err); + return(0); + + case HDIO_GET_MULTCOUNT: + if(!arg) return(-EINVAL); + err = verify_area(VERIFY_WRITE, (long *) arg, sizeof(long)); + if(err) + return(err); + return(0); + + case HDIO_SET_MULTCOUNT: + if(!capable(CAP_SYS_ADMIN)) return(-EACCES); + if(inode->i_bdev->bd_contains != inode->i_bdev) + return(-EINVAL); + return(0); + + case HDIO_GET_IDENTITY: + ubd_id.cyls = dev->size / (128 * 32 * 512); + if(copy_to_user((char *) arg, (char *) &ubd_id, + sizeof(ubd_id))) + return(-EFAULT); + return(0); + + case CDROMVOLREAD: + if(copy_from_user(&volume, (char *) arg, sizeof(volume))) + return(-EFAULT); + volume.channel0 = 255; + volume.channel1 = 255; + volume.channel2 = 255; + volume.channel3 = 255; + if(copy_to_user((char *) arg, &volume, sizeof(volume))) + return(-EFAULT); + return(0); + } + return(-EINVAL); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c --- a/arch/um/drivers/ubd_user.c Fri Oct 31 14:10:53 2003 +++ b/arch/um/drivers/ubd_user.c Fri Oct 31 14:10:53 2003 @@ -27,139 +27,20 @@ #include #include -#if __BYTE_ORDER == __BIG_ENDIAN -# define ntohll(x) (x) -# define htonll(x) (x) -#elif __BYTE_ORDER == __LITTLE_ENDIAN -# define ntohll(x) bswap_64(x) -# define htonll(x) bswap_64(x) -#else -#error "__BYTE_ORDER not defined" -#endif - -#define PATH_LEN_V1 256 - -struct cow_header_v1 { - int magic; - int version; - char backing_file[PATH_LEN_V1]; - time_t mtime; - __u64 size; - int sectorsize; -}; - -#define PATH_LEN_V2 MAXPATHLEN - -struct cow_header_v2 { - unsigned long magic; - unsigned long version; - char backing_file[PATH_LEN_V2]; - time_t mtime; - __u64 size; - int sectorsize; -}; - -union cow_header { - struct cow_header_v1 v1; - struct cow_header_v2 v2; -}; - -#define COW_MAGIC 0x4f4f4f4d /* MOOO */ -#define COW_VERSION 2 - -static void sizes(__u64 size, int sectorsize, int bitmap_offset, - unsigned long *bitmap_len_out, int *data_offset_out) -{ - *bitmap_len_out = (size + sectorsize - 1) / (8 * sectorsize); - - *data_offset_out = bitmap_offset + *bitmap_len_out; - *data_offset_out = (*data_offset_out + sectorsize - 1) / sectorsize; - *data_offset_out *= sectorsize; -} - -static int read_cow_header(int fd, int *magic_out, char **backing_file_out, - time_t *mtime_out, __u64 *size_out, - int *sectorsize_out, int *bitmap_offset_out) -{ - union cow_header *header; - char *file; - int err, n; - unsigned long version, magic; - - header = um_kmalloc(sizeof(*header)); - if(header == NULL){ - printk("read_cow_header - Failed to allocate header\n"); - return(-ENOMEM); - } - err = -EINVAL; - n = read(fd, header, sizeof(*header)); - if(n < offsetof(typeof(header->v1), backing_file)){ - printk("read_cow_header - short header\n"); - goto out; - } - - magic = header->v1.magic; - if(magic == COW_MAGIC) { - version = header->v1.version; - } - else if(magic == 
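+	/* A byte-swapped match means the header was written on a
+	 * machine of the opposite endianness; the V2 fields are then
+	 * converted with ntohl()/ntohll() as they are read below. */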
ntohl(COW_MAGIC)){ - version = ntohl(header->v1.version); - } - else goto out; - - *magic_out = COW_MAGIC; - - if(version == 1){ - if(n < sizeof(header->v1)){ - printk("read_cow_header - failed to read V1 header\n"); - goto out; - } - *mtime_out = header->v1.mtime; - *size_out = header->v1.size; - *sectorsize_out = header->v1.sectorsize; - *bitmap_offset_out = sizeof(header->v1); - file = header->v1.backing_file; - } - else if(version == 2){ - if(n < sizeof(header->v2)){ - printk("read_cow_header - failed to read V2 header\n"); - goto out; - } - *mtime_out = ntohl(header->v2.mtime); - *size_out = ntohll(header->v2.size); - *sectorsize_out = ntohl(header->v2.sectorsize); - *bitmap_offset_out = sizeof(header->v2); - file = header->v2.backing_file; - } - else { - printk("read_cow_header - invalid COW version\n"); - goto out; - } - err = -ENOMEM; - *backing_file_out = uml_strdup(file); - if(*backing_file_out == NULL){ - printk("read_cow_header - failed to allocate backing file\n"); - goto out; - } - err = 0; - out: - kfree(header); - return(err); -} static int same_backing_files(char *from_cmdline, char *from_cow, char *cow) { - struct stat buf1, buf2; + struct stat64 buf1, buf2; if(from_cmdline == NULL) return(1); if(!strcmp(from_cmdline, from_cow)) return(1); - if(stat(from_cmdline, &buf1) < 0){ + if(stat64(from_cmdline, &buf1) < 0){ printk("Couldn't stat '%s', errno = %d\n", from_cmdline, errno); return(1); } - if(stat(from_cow, &buf2) < 0){ + if(stat64(from_cow, &buf2) < 0){ printk("Couldn't stat '%s', errno = %d\n", from_cow, errno); return(1); } @@ -215,121 +96,16 @@ return(0); } -static int absolutize(char *to, int size, char *from) -{ - char save_cwd[256], *slash; - int remaining; - - if(getcwd(save_cwd, sizeof(save_cwd)) == NULL) { - printk("absolutize : unable to get cwd - errno = %d\n", errno); - return(-1); - } - slash = strrchr(from, '/'); - if(slash != NULL){ - *slash = '\0'; - if(chdir(from)){ - *slash = '/'; - printk("absolutize : Can't cd to '%s' - errno = %d\n", - from, errno); - return(-1); - } - *slash = '/'; - if(getcwd(to, size) == NULL){ - printk("absolutize : unable to get cwd of '%s' - " - "errno = %d\n", from, errno); - return(-1); - } - remaining = size - strlen(to); - if(strlen(slash) + 1 > remaining){ - printk("absolutize : unable to fit '%s' into %d " - "chars\n", from, size); - return(-1); - } - strcat(to, slash); - } - else { - if(strlen(save_cwd) + 1 + strlen(from) + 1 > size){ - printk("absolutize : unable to fit '%s' into %d " - "chars\n", from, size); - return(-1); - } - strcpy(to, save_cwd); - strcat(to, "/"); - strcat(to, from); - } - chdir(save_cwd); - return(0); -} - -static int write_cow_header(char *cow_file, int fd, char *backing_file, - int sectorsize, long long *size) -{ - struct cow_header_v2 *header; - struct stat64 buf; - int err; +#define read_cow_header(file_reader, fd, magic, backing_file, mtime, size, sectorsize, bitmap_offset_out) (0) +#define write_cow_header(file, fd, backing_file_out, sectorsize, size) (0) - err = os_seek_file(fd, 0); - if(err != 0){ - printk("write_cow_header - lseek failed, errno = %d\n", errno); - return(-errno); - } +#define cow_sizes(size, sectorsize, bitmap_offset_out, bitmap_len_out, data_offset_out) do {;} while(0) - err = -ENOMEM; - header = um_kmalloc(sizeof(*header)); - if(header == NULL){ - printk("Failed to allocate COW V2 header\n"); - goto out; - } - header->magic = htonl(COW_MAGIC); - header->version = htonl(COW_VERSION); +#define init_cow_file(fd, cow_file, backing_file, sectorsize, bitmap_offset_out, 
bitmap_len_out, data_offset_out) (0) - err = -EINVAL; - if(strlen(backing_file) > sizeof(header->backing_file) - 1){ - printk("Backing file name \"%s\" is too long - names are " - "limited to %d characters\n", backing_file, - sizeof(header->backing_file) - 1); - goto out_free; - } - - if(absolutize(header->backing_file, sizeof(header->backing_file), - backing_file)) - goto out_free; - - err = stat64(header->backing_file, &buf); - if(err < 0){ - printk("Stat of backing file '%s' failed, errno = %d\n", - header->backing_file, errno); - err = -errno; - goto out_free; - } - - err = os_file_size(header->backing_file, size); - if(err){ - printk("Couldn't get size of backing file '%s', errno = %d\n", - header->backing_file, -*size); - goto out_free; - } - - header->mtime = htonl(buf.st_mtime); - header->size = htonll(*size); - header->sectorsize = htonl(sectorsize); - - err = write(fd, header, sizeof(*header)); - if(err != sizeof(*header)){ - printk("Write of header to new COW file '%s' failed, " - "errno = %d\n", cow_file, errno); - goto out_free; - } - err = 0; - out_free: - kfree(header); - out: - return(err); -} - -int open_ubd_file(char *file, struct openflags *openflags, - char **backing_file_out, int *bitmap_offset_out, - unsigned long *bitmap_len_out, int *data_offset_out, +int open_ubd_file(char *file, struct openflags *openflags, + char **backing_file_out, int *bitmap_offset_out, + unsigned long *bitmap_len_out, int *data_offset_out, int *create_cow_out) { time_t mtime; @@ -346,10 +122,17 @@ if((fd = os_open_file(file, *openflags, mode)) < 0) return(fd); } + + err = os_lock_file(fd, openflags->w); + if(err){ + printk("Failed to lock '%s', errno = %d\n", file, -err); + goto error; + } + if(backing_file_out == NULL) return(fd); - err = read_cow_header(fd, &magic, &backing_file, &mtime, &size, - §orsize, bitmap_offset_out); + err = read_cow_header(file_reader, &fd, &magic, &backing_file, &mtime, + &size, §orsize, bitmap_offset_out); if(err && (*backing_file_out != NULL)){ printk("Failed to read COW header from COW file \"%s\", " "errno = %d\n", file, err); @@ -376,12 +159,12 @@ if(err) goto error; } - sizes(size, sectorsize, *bitmap_offset_out, bitmap_len_out, - data_offset_out); + cow_sizes(size, sectorsize, *bitmap_offset_out, bitmap_len_out, + data_offset_out); return(fd); error: - close(fd); + os_close_file(fd); return(err); } @@ -389,10 +172,7 @@ int sectorsize, int *bitmap_offset_out, unsigned long *bitmap_len_out, int *data_offset_out) { - __u64 blocks; - long zero; - int err, fd, i; - long long size; + int err, fd; flags.c = 1; fd = open_ubd_file(cow_file, &flags, NULL, NULL, NULL, NULL, NULL); @@ -403,29 +183,12 @@ goto out; } - err = write_cow_header(cow_file, fd, backing_file, sectorsize, &size); - if(err) goto out_close; - - blocks = (size + sectorsize - 1) / sectorsize; - blocks = (blocks + sizeof(long) * 8 - 1) / (sizeof(long) * 8); - zero = 0; - for(i = 0; i < blocks; i++){ - err = write(fd, &zero, sizeof(zero)); - if(err != sizeof(zero)){ - printk("Write of bitmap to new COW file '%s' failed, " - "errno = %d\n", cow_file, errno); - goto out_close; - } - } - - sizes(size, sectorsize, sizeof(struct cow_header_v2), - bitmap_len_out, data_offset_out); - *bitmap_offset_out = sizeof(struct cow_header_v2); - - return(fd); - - out_close: - close(fd); + err = init_cow_file(fd, cow_file, backing_file, sectorsize, + bitmap_offset_out, bitmap_len_out, + data_offset_out); + if(!err) + return(fd); + os_close_file(fd); out: return(err); } @@ -446,14 +209,6 @@ n = write(fd, buffer, len); 
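 	/* As with the other user-side helpers, the convention is to
 	 * return the byte count on success and -errno on failure, so
 	 * kernel-side callers can report the negated value directly. */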
if(n < 0) return(-errno); else return(n); -} - -int ubd_is_dir(char *file) -{ - struct stat64 buf; - - if(stat64(file, &buf) < 0) return(0); - return(S_ISDIR(buf.st_mode)); } void do_io(struct io_thread_req *req) diff -Nru a/arch/um/drivers/ubd_user.c~uml-kill-cow.diff b/arch/um/drivers/ubd_user.c~uml-kill-cow.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/drivers/ubd_user.c~uml-kill-cow.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,370 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2001 Ridgerun,Inc (glonnon@ridgerun.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "asm/types.h" +#include "user_util.h" +#include "kern_util.h" +#include "user.h" +#include "ubd_user.h" +#include "os.h" +#include "cow.h" + +#include +#include + +static int same_backing_files(char *from_cmdline, char *from_cow, char *cow) +{ + struct stat64 buf1, buf2; + + if(from_cmdline == NULL) return(1); + if(!strcmp(from_cmdline, from_cow)) return(1); + + if(stat64(from_cmdline, &buf1) < 0){ + printk("Couldn't stat '%s', errno = %d\n", from_cmdline, + errno); + return(1); + } + if(stat64(from_cow, &buf2) < 0){ + printk("Couldn't stat '%s', errno = %d\n", from_cow, errno); + return(1); + } + if((buf1.st_dev == buf2.st_dev) && (buf1.st_ino == buf2.st_ino)) + return(1); + + printk("Backing file mismatch - \"%s\" requested,\n" + "\"%s\" specified in COW header of \"%s\"\n", + from_cmdline, from_cow, cow); + return(0); +} + +static int backing_file_mismatch(char *file, __u64 size, time_t mtime) +{ + struct stat64 buf; + long long actual; + int err; + + if(stat64(file, &buf) < 0){ + printk("Failed to stat backing file \"%s\", errno = %d\n", + file, errno); + return(-errno); + } + + err = os_file_size(file, &actual); + if(err){ + printk("Failed to get size of backing file \"%s\", " + "errno = %d\n", file, -err); + return(err); + } + + if(actual != size){ + printk("Size mismatch (%ld vs %ld) of COW header vs backing " + "file\n", size, actual); + return(-EINVAL); + } + if(buf.st_mtime != mtime){ + printk("mtime mismatch (%ld vs %ld) of COW header vs backing " + "file\n", mtime, buf.st_mtime); + return(-EINVAL); + } + return(0); +} + +int read_cow_bitmap(int fd, void *buf, int offset, int len) +{ + int err; + + err = os_seek_file(fd, offset); + if(err != 0) return(-errno); + err = read(fd, buf, len); + if(err < 0) return(-errno); + return(0); +} + +int open_ubd_file(char *file, struct openflags *openflags, + char **backing_file_out, int *bitmap_offset_out, + unsigned long *bitmap_len_out, int *data_offset_out, + int *create_cow_out) +{ + time_t mtime; + __u64 size; + char *backing_file; + int fd, err, sectorsize, magic, same, mode = 0644; + + if((fd = os_open_file(file, *openflags, mode)) < 0){ + if((fd == -ENOENT) && (create_cow_out != NULL)) + *create_cow_out = 1; + if(!openflags->w || + ((errno != EROFS) && (errno != EACCES))) return(-errno); + openflags->w = 0; + if((fd = os_open_file(file, *openflags, mode)) < 0) + return(fd); + } + + err = os_lock_file(fd, openflags->w); + if(err){ + printk("Failed to lock '%s', errno = %d\n", file, -err); + goto error; + } + + if(backing_file_out == NULL) return(fd); + + err = read_cow_header(file_reader, &fd, &magic, &backing_file, &mtime, + &size, §orsize, bitmap_offset_out); + if(err && (*backing_file_out != NULL)){ + printk("Failed to read COW header from COW file \"%s\", " + "errno = %d\n", file, 
err); + goto error; + } + if(err) return(fd); + + if(backing_file_out == NULL) return(fd); + + same = same_backing_files(*backing_file_out, backing_file, file); + + if(!same && !backing_file_mismatch(*backing_file_out, size, mtime)){ + printk("Switching backing file to '%s'\n", *backing_file_out); + err = write_cow_header(file, fd, *backing_file_out, + sectorsize, &size); + if(err){ + printk("Switch failed, errno = %d\n", err); + return(err); + } + } + else { + *backing_file_out = backing_file; + err = backing_file_mismatch(*backing_file_out, size, mtime); + if(err) goto error; + } + + cow_sizes(size, sectorsize, *bitmap_offset_out, bitmap_len_out, + data_offset_out); + + return(fd); + error: + os_close_file(fd); + return(err); +} + +int create_cow_file(char *cow_file, char *backing_file, struct openflags flags, + int sectorsize, int *bitmap_offset_out, + unsigned long *bitmap_len_out, int *data_offset_out) +{ + int err, fd; + + flags.c = 1; + fd = open_ubd_file(cow_file, &flags, NULL, NULL, NULL, NULL, NULL); + if(fd < 0){ + err = fd; + printk("Open of COW file '%s' failed, errno = %d\n", cow_file, + -err); + goto out; + } + + err = init_cow_file(fd, cow_file, backing_file, sectorsize, + bitmap_offset_out, bitmap_len_out, + data_offset_out); + if(!err) + return(fd); + os_close_file(fd); + out: + return(err); +} + +int read_ubd_fs(int fd, void *buffer, int len) +{ + int n; + + n = read(fd, buffer, len); + if(n < 0) return(-errno); + else return(n); +} + +int write_ubd_fs(int fd, char *buffer, int len) +{ + int n; + + n = write(fd, buffer, len); + if(n < 0) return(-errno); + else return(n); +} + +void do_io(struct io_thread_req *req) +{ + char *buf; + unsigned long len; + int n, nsectors, start, end, bit; + __u64 off; + + nsectors = req->length / req->sectorsize; + start = 0; + do { + bit = ubd_test_bit(start, (unsigned char *) &req->sector_mask); + end = start; + while((end < nsectors) && + (ubd_test_bit(end, (unsigned char *) + &req->sector_mask) == bit)) + end++; + + if(end != nsectors) + printk("end != nsectors\n"); + off = req->offset + req->offsets[bit] + + start * req->sectorsize; + len = (end - start) * req->sectorsize; + buf = &req->buffer[start * req->sectorsize]; + + if(os_seek_file(req->fds[bit], off) != 0){ + printk("do_io - lseek failed : errno = %d\n", errno); + req->error = 1; + return; + } + if(req->op == UBD_READ){ + n = 0; + do { + buf = &buf[n]; + len -= n; + n = read(req->fds[bit], buf, len); + if (n < 0) { + printk("do_io - read returned %d : " + "errno = %d fd = %d\n", n, + errno, req->fds[bit]); + req->error = 1; + return; + } + } while((n < len) && (n != 0)); + if (n < len) memset(&buf[n], 0, len - n); + } + else { + n = write(req->fds[bit], buf, len); + if(n != len){ + printk("do_io - write returned %d : " + "errno = %d fd = %d\n", n, + errno, req->fds[bit]); + req->error = 1; + return; + } + } + + start = end; + } while(start < nsectors); + + if(req->cow_offset != -1){ + if(os_seek_file(req->fds[1], req->cow_offset) != 0){ + printk("do_io - bitmap lseek failed : errno = %d\n", + errno); + req->error = 1; + return; + } + n = write(req->fds[1], &req->bitmap_words, + sizeof(req->bitmap_words)); + if(n != sizeof(req->bitmap_words)){ + printk("do_io - bitmap update returned %d : " + "errno = %d fd = %d\n", n, errno, req->fds[1]); + req->error = 1; + return; + } + } + req->error = 0; + return; +} + +/* Changed in start_io_thread, which is serialized by being called only + * from ubd_init, which is an initcall. 
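+ * The pipe protocol is symmetric: the kernel side writes one struct
+ * io_thread_req, the io thread services it with do_io() and writes
+ * the same struct back with its error field filled in, which makes
+ * thread_fd readable and raises UBD_IRQ on the kernel side.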
+ */ +int kernel_fd = -1; + +/* Only changed by the io thread */ +int io_count = 0; + +int io_thread(void *arg) +{ + struct io_thread_req req; + int n; + + signal(SIGWINCH, SIG_IGN); + while(1){ + n = read(kernel_fd, &req, sizeof(req)); + if(n < 0) printk("io_thread - read returned %d, errno = %d\n", + n, errno); + else if(n < sizeof(req)){ + printk("io_thread - short read : length = %d\n", n); + continue; + } + io_count++; + do_io(&req); + n = write(kernel_fd, &req, sizeof(req)); + if(n != sizeof(req)) + printk("io_thread - write failed, errno = %d\n", + errno); + } +} + +int start_io_thread(unsigned long sp, int *fd_out) +{ + int pid, fds[2], err; + + err = os_pipe(fds, 1, 1); + if(err){ + printk("start_io_thread - os_pipe failed, errno = %d\n", -err); + return(-1); + } + kernel_fd = fds[0]; + *fd_out = fds[1]; + + pid = clone(io_thread, (void *) sp, CLONE_FILES | CLONE_VM | SIGCHLD, + NULL); + if(pid < 0){ + printk("start_io_thread - clone failed : errno = %d\n", errno); + return(-errno); + } + return(pid); +} + +#ifdef notdef +int start_io_thread(unsigned long sp, int *fd_out) +{ + int pid; + + if((kernel_fd = get_pty()) < 0) return(-1); + raw(kernel_fd, 0); + if((*fd_out = open(ptsname(kernel_fd), O_RDWR)) < 0){ + printk("Couldn't open tty for IO\n"); + return(-1); + } + + pid = clone(io_thread, (void *) sp, CLONE_FILES | CLONE_VM | SIGCHLD, + NULL); + if(pid < 0){ + printk("start_io_thread - clone failed : errno = %d\n", errno); + return(-errno); + } + return(pid); +} +#endif + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/drivers/ubd_user.c~uml-summa.diff b/arch/um/drivers/ubd_user.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/drivers/ubd_user.c~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,621 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Copyright (C) 2001 Ridgerun,Inc (glonnon@ridgerun.com) + * Licensed under the GPL + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "asm/types.h" +#include "user_util.h" +#include "kern_util.h" +#include "user.h" +#include "ubd_user.h" +#include "os.h" + +#include +#include +#if __BYTE_ORDER == __BIG_ENDIAN +# define ntohll(x) (x) +# define htonll(x) (x) +#elif __BYTE_ORDER == __LITTLE_ENDIAN +# define ntohll(x) bswap_64(x) +# define htonll(x) bswap_64(x) +#else +#error "__BYTE_ORDER not defined" +#endif + +#define PATH_LEN_V1 256 + +struct cow_header_v1 { + int magic; + int version; + char backing_file[PATH_LEN_V1]; + time_t mtime; + __u64 size; + int sectorsize; +}; + +#define PATH_LEN_V2 MAXPATHLEN + +struct cow_header_v2 { + unsigned long magic; + unsigned long version; + char backing_file[PATH_LEN_V2]; + time_t mtime; + __u64 size; + int sectorsize; +}; + +union cow_header { + struct cow_header_v1 v1; + struct cow_header_v2 v2; +}; + +#define COW_MAGIC 0x4f4f4f4d /* MOOO */ +#define COW_VERSION 2 + +static void sizes(__u64 size, int sectorsize, int bitmap_offset, + unsigned long *bitmap_len_out, int *data_offset_out) +{ + *bitmap_len_out = (size + sectorsize - 1) / (8 * sectorsize); + + *data_offset_out = bitmap_offset + *bitmap_len_out; 
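+	/* One bitmap bit covers one sector.  For example, with
+	 * size = 1 GiB and sectorsize = 512, bitmap_len works out to
+	 * 2^30 / (8 * 512) = 262144 bytes; the data area then starts
+	 * at the next sectorsize-aligned offset after the bitmap. */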
diff -Nru a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c
--- a/arch/um/drivers/xterm.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/drivers/xterm.c	Fri Oct 31 14:10:53 2003
@@ -108,7 +108,7 @@
 	}
 	close(fd);
 
-	fd = create_unix_socket(file, sizeof(file));
+	fd = create_unix_socket(file, sizeof(file), 1);
 	if(fd < 0){
 		printk("xterm_open : create_unix_socket failed, errno = %d\n",
 		       -fd);
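The new third argument requests a close-on-exec socket, so the descriptor does not leak into the xterm and port-helper processes this driver spawns. The standard host-side mechanism is FD_CLOEXEC; a sketch of how such a flag is typically honored, not necessarily the patch's exact implementation:

    /* Sketch: mark an fd close-on-exec with plain POSIX fcntl(2). */
    #include <fcntl.h>

    static int set_cloexec(int fd)
    {
            int flags = fcntl(fd, F_GETFD);

            if(flags < 0)
                    return(-1);
            return(fcntl(fd, F_SETFD, flags | FD_CLOEXEC));
    }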
diff -Nru a/arch/um/drivers/xterm_kern.c b/arch/um/drivers/xterm_kern.c
--- a/arch/um/drivers/xterm_kern.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/drivers/xterm_kern.c	Fri Oct 31 14:10:54 2003
@@ -5,7 +5,10 @@
 
 #include "linux/errno.h"
 #include "linux/slab.h"
+#include "linux/signal.h"
+#include "linux/interrupt.h"
 #include "asm/semaphore.h"
+#include "linux/signal.h"
 #include "asm/irq.h"
 #include "irq_user.h"
 #include "kern_util.h"
@@ -19,17 +22,18 @@
 	int new_fd;
 };
 
-static void xterm_interrupt(int irq, void *data, struct pt_regs *regs)
+static irqreturn_t xterm_interrupt(int irq, void *data, struct pt_regs *regs)
 {
 	struct xterm_wait *xterm = data;
 	int fd;
 
 	fd = os_rcv_fd(xterm->fd, &xterm->pid);
 	if(fd == -EAGAIN)
-		return;
+		return(IRQ_NONE);
 
 	xterm->new_fd = fd;
 	up(&xterm->sem);
+	return(IRQ_HANDLED);
 }
 
 int xterm_fd(int socket, int *pid_out)
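This converts the handler to the 2.6 irqreturn_t convention: IRQ_NONE tells the interrupt core the event was not ours (os_rcv_fd() had nothing to deliver), IRQ_HANDLED that it was consumed, which lets the core detect stuck or unhandled interrupt sources. Registration is unchanged; for reference, the handler is installed in this file with a call of this shape (copied from the pre-patch source, so illustrative rather than normative):

    err = um_request_irq(XTERM_IRQ, socket, IRQ_READ, xterm_interrupt,
                         SA_INTERRUPT | SA_SHIRQ | SA_SAMPLE_RANDOM,
                         "xterm", data);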
diff -Nru a/arch/um/dyn.lds.S b/arch/um/dyn.lds.S
--- a/arch/um/dyn.lds.S	Fri Oct 31 14:10:54 2003
+++ b/arch/um/dyn.lds.S	Fri Oct 31 14:10:54 2003
@@ -15,7 +15,11 @@
   . = ALIGN(4096);		/* Init code and data */
   _stext = .;
   __init_begin = .;
-  .text.init : { *(.text.init) }
+  .init.text : {
+	_sinittext = .;
+	*(.init.text)
+	_einittext = .;
+  }
 
   . = ALIGN(4096);
 
@@ -67,7 +71,7 @@
 
 #include "asm/common.lds.S"
 
-  .data.init : { *(.data.init) }
+  .init.data : { *(.init.data) }
 
  /* Ensure the __preinit_array_start label is properly aligned.  We
    could instead move the label definition inside the section, but
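The new _sinittext and _einittext symbols bracket .init.text so the kernel can distinguish init-only code from permanent text, for example when deciding whether an instruction pointer is still valid after init memory is released. A sketch of the usual test; addr_in_init_text() is an illustrative helper, not something this patch adds:

    /* Sketch: classify an address against the bracketed .init.text. */
    extern char _sinittext[], _einittext[];

    static int addr_in_init_text(unsigned long addr)
    {
            return(addr >= (unsigned long) _sinittext &&
                   addr < (unsigned long) _einittext);
    }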
diff -Nru a/arch/um/include/kern_util.h b/arch/um/include/kern_util.h
--- a/arch/um/include/kern_util.h	Fri Oct 31 14:10:53 2003
+++ b/arch/um/include/kern_util.h	Fri Oct 31 14:10:53 2003
@@ -63,10 +63,9 @@
 extern void *syscall_sp(void *t);
 extern void syscall_trace(void);
 extern int hz(void);
-extern void idle_timer(void);
+extern void uml_idle_timer(void);
 extern unsigned int do_IRQ(int irq, union uml_pt_regs *regs);
 extern int external_pid(void *t);
-extern int pid_to_processor_id(int pid);
 extern void boot_timer_handler(int sig);
 extern void interrupt_end(void);
 extern void initial_thread_cb(void (*proc)(void *), void *arg);
@@ -90,9 +89,7 @@
 extern char *uml_strdup(char *string);
 extern void unprotect_kernel_mem(void);
 extern void protect_kernel_mem(void);
-extern void set_kmem_end(unsigned long);
 extern void uml_cleanup(void);
-extern int pid_to_processor_id(int pid);
 extern void set_current(void *t);
 extern void lock_signalled_task(void *t);
 extern void IPI_handler(int cpu);
@@ -101,7 +98,9 @@
 extern int clear_user_proc(void *buf, int size);
 extern int copy_to_user_proc(void *to, void *from, int size);
 extern int copy_from_user_proc(void *to, void *from, int size);
+extern int strlen_user_proc(char *str);
 extern void bus_handler(int sig, union uml_pt_regs *regs);
+extern void winch(int sig, union uml_pt_regs *regs);
 extern long execute_syscall(void *r);
 extern int smp_sigio_handler(void);
 extern void *get_current(void);
diff -Nru a/arch/um/include/line.h b/arch/um/include/line.h
--- a/arch/um/include/line.h	Fri Oct 31 14:10:54 2003
+++ b/arch/um/include/line.h	Fri Oct 31 14:10:54 2003
@@ -9,12 +9,14 @@
 #include "linux/list.h"
 #include "linux/workqueue.h"
 #include "linux/tty.h"
+#include "linux/interrupt.h"
 #include "asm/semaphore.h"
 #include "chan_user.h"
 #include "mconsole_kern.h"
 
 struct line_driver {
 	char *name;
+	char *device_name;
 	char *devfs_name;
 	short major;
 	short minor_start;
@@ -67,8 +69,9 @@
 
 #define LINES_INIT(n) { num : n }
 
-extern void line_interrupt(int irq, void *data, struct pt_regs *unused);
-extern void line_write_interrupt(int irq, void *data, struct pt_regs *unused);
+extern irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused);
+extern irqreturn_t line_write_interrupt(int irq, void *data,
+					struct pt_regs *unused);
 extern void line_close(struct line *lines, struct tty_struct *tty);
 extern int line_open(struct line *lines, struct tty_struct *tty,
 		     struct chan_opts *opts);
diff -Nru a/arch/um/include/mconsole.h b/arch/um/include/mconsole.h
--- a/arch/um/include/mconsole.h	Fri Oct 31 14:10:53 2003
+++ b/arch/um/include/mconsole.h	Fri Oct 31 14:10:53 2003
@@ -77,6 +77,7 @@
 extern void mconsole_cad(struct mc_request *req);
 extern void mconsole_stop(struct mc_request *req);
 extern void mconsole_go(struct mc_request *req);
+extern void mconsole_log(struct mc_request *req);
 
 extern int mconsole_get_request(int fd, struct mc_request *req);
 extern int mconsole_notify(char *sock_name, int type, const void *data,
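mconsole_log is a new management-console command; like the other handlers declared here, it receives a parsed struct mc_request and answers through mconsole_reply(). A sketch of the handler shape only, assuming the mconsole_reply() signature declared in this header; the real implementation does more than acknowledge (it presumably feeds the payload to the kernel log):

    /* Sketch: the general shape of an mconsole command handler. */
    void mconsole_log(struct mc_request *req)
    {
            /* req->request.data carries the text following the command
             * name; reply with err == 0 and more == 0 to finish.
             */
            mconsole_reply(req, "", 0, 0);
    }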
diff -Nru a/arch/um/include/mem.h b/arch/um/include/mem.h
--- a/arch/um/include/mem.h	Fri Oct 31 14:10:54 2003
+++ b/arch/um/include/mem.h	Fri Oct 31 14:10:54 2003
@@ -13,7 +13,6 @@
 };
 
 extern void set_usable_vm(unsigned long start, unsigned long end);
-extern void set_kmem_end(unsigned long new);
 
 #endif
 
diff -Nru a/arch/um/include/mem_user.h b/arch/um/include/mem_user.h
--- a/arch/um/include/mem_user.h	Fri Oct 31 14:10:54 2003
+++ b/arch/um/include/mem_user.h	Fri Oct 31 14:10:54 2003
@@ -51,9 +51,6 @@
 
 extern int init_mem_user(void);
 extern int create_mem_file(unsigned long len);
-extern void setup_range(int fd, char *driver, unsigned long start,
-			unsigned long pfn, unsigned long total, int need_vm,
-			struct mem_region *region, void *reserved);
 extern void setup_memory(void *entry);
 extern unsigned long find_iomem(char *driver, unsigned long *len_out);
 extern int init_maps(struct mem_region *region);
diff -Nru a/arch/um/include/os.h b/arch/um/include/os.h
--- a/arch/um/include/os.h	Fri Oct 31 14:10:53 2003
+++ b/arch/um/include/os.h	Fri Oct 31 14:10:53 2003
@@ -103,10 +103,11 @@
 extern int os_shutdown_socket(int fd, int r, int w);
 extern void os_close_file(int fd);
 extern int os_rcv_fd(int fd, int *helper_pid_out);
-extern int create_unix_socket(char *file, int len);
+extern int create_unix_socket(char *file, int len, int close_on_exec);
 extern int os_connect_socket(char *name);
 extern int os_file_type(char *file);
 extern int os_file_mode(char *file, struct openflags *mode_out);
+extern int os_lock_file(int fd, int excl);
 
 extern unsigned long os_process_pc(int pid);
 extern int os_process_parent(int pid);
@@ -120,6 +121,7 @@
 extern int os_protect_memory(void *addr, unsigned long len,
 			     int r, int w, int x);
 extern int os_unmap_memory(void *addr, int len);
+extern void os_flush_stdout(void);
 
 #endif
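os_lock_file(fd, excl) is new in this interface; the name suggests taking a shared or exclusive lock on a host file, which the ubd driver can use to stop two UML instances from opening the same image read-write. One plausible host-side implementation, using flock(2); this is an assumption for illustration, not necessarily what the os layer actually does:

    /* Sketch: non-blocking shared/exclusive host file lock.  The
     * flock(2) approach is a guess at the semantics, not UML's code.
     */
    #include <sys/file.h>
    #include <errno.h>

    int os_lock_file(int fd, int excl)
    {
            if(flock(fd, (excl ? LOCK_EX : LOCK_SH) | LOCK_NB) < 0)
                    return(-errno);
            return(0);
    }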
diff -Nru a/arch/um/include/sysdep-i386/checksum.h b/arch/um/include/sysdep-i386/checksum.h
--- a/arch/um/include/sysdep-i386/checksum.h	Fri Oct 31 14:10:53 2003
+++ b/arch/um/include/sysdep-i386/checksum.h	Fri Oct 31 14:10:53 2003
@@ -6,6 +6,7 @@
 #define __UM_SYSDEP_CHECKSUM_H
 
 #include "linux/string.h"
+#include "linux/in6.h"
 
 /*
  * computes the checksum of a memory block at buff, length len,
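The added include is needed because this header declares csum_ipv6_magic(), which takes struct in6_addr pointers; without linux/in6.h, a translation unit that pulls in the checksum header first would see an undeclared type. The affected signature, as it appears further down the file:

    static inline unsigned short int csum_ipv6_magic(struct in6_addr *saddr,
                                                     struct in6_addr *daddr,
                                                     __u32 len,
                                                     unsigned short proto,
                                                     unsigned int sum);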
diff -Nru a/arch/um/include/sysdep-i386/sigcontext.h b/arch/um/include/sysdep-i386/sigcontext.h
--- a/arch/um/include/sysdep-i386/sigcontext.h	Fri Oct 31 14:10:54 2003
+++ b/arch/um/include/sysdep-i386/sigcontext.h	Fri Oct 31 14:10:54 2003
@@ -28,8 +28,8 @@
  */
 #define SC_START_SYSCALL(sc) do SC_EAX(sc) = -ENOSYS; while(0)
 
-/* These are General Protection and Page Fault */
-#define SEGV_IS_FIXABLE(trap) ((trap == 13) || (trap == 14))
+/* This is Page Fault */
+#define SEGV_IS_FIXABLE(trap) (trap == 14)
 
 #define SC_SEGV_IS_FIXABLE(sc) (SEGV_IS_FIXABLE(SC_TRAPNO(sc)))
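The SEGV_IS_FIXABLE change above narrows the traps UML will hand to its page-fault path: trap 14 (page fault) can legitimately be fixed up, while trap 13 (general protection) now falls through to the error path. A hedged sketch of the consumer side, with the handler names invented for illustration; only SC_SEGV_IS_FIXABLE() comes from the header:

	void segv_sketch(void *sc)
	{
		if(SC_SEGV_IS_FIXABLE(sc))
			fixup_page_fault_sketch(sc);	/* trap 14 only, now */
		else
			bad_segv_sketch(sc);		/* GP faults land here */
	}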
diff -Nru a/arch/um/include/time_user.h b/arch/um/include/time_user.h
--- a/arch/um/include/time_user.h	Fri Oct 31 14:10:53 2003
+++ b/arch/um/include/time_user.h	Fri Oct 31 14:10:53 2003
@@ -13,5 +13,8 @@
 extern void enable_timer(void);
 extern unsigned long time_lock(void);
 extern void time_unlock(unsigned long);
+#ifndef NSEC_PER_SEC
+#define NSEC_PER_SEC (1000000000L)
+#endif
 
 #endif
diff -Nru a/arch/um/include/ubd_user.h b/arch/um/include/ubd_user.h
--- a/arch/um/include/ubd_user.h	Fri Oct 31 14:10:54 2003
+++ b/arch/um/include/ubd_user.h	Fri Oct 31 14:10:54 2003
@@ -39,7 +39,6 @@
 extern int write_ubd_fs(int fd, char *buffer, int len);
 extern int start_io_thread(unsigned long sp, int *fds_out);
 extern void do_io(struct io_thread_req *req);
-extern int ubd_is_dir(char *file);
 
 static inline int ubd_test_bit(__u64 bit, unsigned char *data)
 {
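The #ifndef guard in the time_user.h hunk lets this header be used from UML's user-space objects, which cannot pull in the kernel's time headers, without colliding when a definition of NSEC_PER_SEC is already in scope. A small illustrative use of the constant; the helper itself is not part of the patch:

	#include <sys/time.h>

	/* hypothetical helper: nanoseconds from a gettimeofday() result */
	static unsigned long long tv_to_nsec(struct timeval *tv)
	{
		return(((unsigned long long) tv->tv_sec) * NSEC_PER_SEC +
		       ((unsigned long long) tv->tv_usec) * 1000);
	}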
diff -Nru a/arch/um/include/user.h b/arch/um/include/user.h
--- a/arch/um/include/user.h	Fri Oct 31 14:10:53 2003
+++ b/arch/um/include/user.h	Fri Oct 31 14:10:53 2003
@@ -14,7 +14,7 @@
 extern void kfree(void *ptr);
 extern int in_aton(char *str);
 extern int open_gdb_chan(void);
-
+extern int strlcpy(char *, const char *, int);
 #endif
 
 /*
diff -Nru a/arch/um/include/user_util.h b/arch/um/include/user_util.h
--- a/arch/um/include/user_util.h	Fri Oct 31 14:10:53 2003
+++ b/arch/um/include/user_util.h	Fri Oct 31 14:10:53 2003
@@ -59,7 +59,6 @@
 extern void *add_signal_handler(int sig, void (*handler)(int));
 extern int start_fork_tramp(void *arg, unsigned long temp_stack,
 			    int clone_flags, int (*tramp)(void *));
-extern int clone_and_wait(int (*fn)(void *), void *arg, void *sp, int flags);
 extern int linux_main(int argc, char **argv);
 extern void set_cmdline(char *cmd);
 extern void input_cb(void (*proc)(void *), void *arg, int arg_len);
@@ -90,7 +89,8 @@
 extern int arch_fixup(unsigned long address, void *sc_ptr);
 extern void forward_pending_sigio(int target);
 extern int can_do_skas(void);
-
+extern void arch_init_thread(void);
+
 #endif
 
 /*
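user.h gives UML's user-space code prototypes for kernel-side helpers, and the new strlcpy declaration follows the usual semantics: copy at most size - 1 bytes, always NUL-terminate, and return the length of the source so truncation is detectable. An illustrative call, with the resulting buffer contents shown in the comment:

	char buf[8];
	int src_len;

	src_len = strlcpy(buf, "hello world", sizeof(buf));
	/* buf now holds "hello w" plus the NUL; src_len == 11, and
	 * src_len >= sizeof(buf) signals that the copy was truncated */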
diff -Nru a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
--- a/arch/um/kernel/Makefile	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/Makefile	Fri Oct 31 14:10:54 2003
@@ -21,6 +21,8 @@
 obj-$(CONFIG_MODE_TT) += tt/
 obj-$(CONFIG_MODE_SKAS) += skas/
 
+clean-files := config.c
+
 user-objs-$(CONFIG_TTY_LOG) += tty_log.o
 
 USER_OBJS := $(filter %_user.o,$(obj-y)) $(user-objs-y) config.o helper.o \
@@ -45,16 +47,12 @@
 $(obj)/frame.o: $(src)/frame.c
 	$(CC) $(CFLAGS_$(notdir $@)) -c -o $@ $<
 
-QUOTE = 'my $$config=`cat $(TOPDIR)/.config`; $$config =~ s/"/\\"/g ; while(<STDIN>) { $$_ =~ s/CONFIG/$$config/; print $$_ }'
+QUOTE = 'my $$config=`cat $(TOPDIR)/.config`; $$config =~ s/"/\\"/g ; $$config =~ s/\n/\\n"\n"/g ; while(<STDIN>) { $$_ =~ s/CONFIG/$$config/; print $$_ }'
 
 $(obj)/config.c : $(src)/config.c.in $(TOPDIR)/.config
 	$(PERL) -e $(QUOTE) < $(src)/config.c.in > $@
 
 $(obj)/config.o : $(obj)/config.c
-
-clean:
-	rm -f config.c
-	for dir in $(subdir-y) ; do $(MAKE) -C $$dir clean; done
 
 modules:
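The extra substitution in QUOTE rewrites every newline of the captured .config as \n" followed by a real newline and an opening quote, so the text substituted for CONFIG becomes a run of adjacent C string literals. Worked through for a two-line .config (contents invented for the example), the generated config.c comes out as:

	/* input fragment:
	 *   CONFIG_MODE_TT=y
	 *   CONFIG_MODE_SKAS=y
	 */
	static __initdata char *config = "CONFIG_MODE_TT=y\n"
	"CONFIG_MODE_SKAS=y\n"
	"";

This is also why the config.c.in hunk below can shrink its template to a single-line "CONFIG" literal.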
diff -Nru a/arch/um/kernel/config.c.in b/arch/um/kernel/config.c.in
--- a/arch/um/kernel/config.c.in	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/config.c.in	Fri Oct 31 14:10:54 2003
@@ -7,9 +7,7 @@
 #include <stdlib.h>
 #include "init.h"
 
-static __initdata char *config = "
-CONFIG
-";
+static __initdata char *config = "CONFIG";
 
 static int __init print_config(char *line, int *add)
 {
diff -Nru a/arch/um/kernel/exec_kern.c b/arch/um/kernel/exec_kern.c
--- a/arch/um/kernel/exec_kern.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/exec_kern.c	Fri Oct 31 14:10:53 2003
@@ -32,10 +32,15 @@
 	CHOOSE_MODE_PROC(start_thread_tt, start_thread_skas, regs, eip, esp);
 }
 
+extern void log_exec(char **argv, void *tty);
+
 static int execve1(char *file, char **argv, char **env)
 {
 	int error;
 
+#ifdef CONFIG_TTY_LOG
+	log_exec(argv, current->tty);
+#endif
 	error = do_execve(file, argv, env, &current->thread.regs);
 	if (error == 0){
 		current->ptrace &= ~PT_DTRACE;
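In the exec_kern.c hunk, log_exec() is declared by hand at its call site because tty_log.c is built as one of UML's user-space objects and shares no header with this file, and the call sits ahead of do_execve() so the attempted command line is logged whether or not the exec succeeds. A hedged sketch of the matching definition; only the signature comes from the extern above, the body and its helper are invented:

	/* in tty_log.c, roughly: */
	void log_exec(char **argv, void *tty)
	{
		char **arg;

		for(arg = argv; *arg != NULL; arg++)
			tty_log_string_sketch(tty, *arg);	/* hypothetical writer */
	}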
diff -Nru a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c
--- a/arch/um/kernel/init_task.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/init_task.c	Fri Oct 31 14:10:54 2003
@@ -18,6 +18,7 @@
 struct mm_struct init_mm = INIT_MM(init_mm);
 static struct files_struct init_files = INIT_FILES;
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
+static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 
 EXPORT_SYMBOL(init_mm);
 
@@ -43,24 +44,10 @@
 __attribute__((__section__(".data.init_task"))) =
 { INIT_THREAD_INFO(init_task) };
 
-struct task_struct *alloc_task_struct(void)
-{
-	return((struct task_struct *)
-	       __get_free_pages(GFP_KERNEL, CONFIG_KERNEL_STACK_ORDER));
-}
-
 void unprotect_stack(unsigned long stack)
 {
 	protect_memory(stack, (1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE,
 		       1, 1, 0, 1);
-}
-
-void free_task_struct(struct task_struct *task)
-{
-	/* free_pages decrements the page counter and only actually frees
-	 * the pages if they are now not accessed by anything.
-	 */
-	free_pages((unsigned long) task, CONFIG_KERNEL_STACK_ORDER);
 }
 
 /*
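unprotect_stack() above covers the whole task-struct-plus-kernel-stack allocation, whose size is fixed by CONFIG_KERNEL_STACK_ORDER. The arithmetic, worked with illustrative values:

	/* assuming CONFIG_KERNEL_STACK_ORDER == 2 and 4096-byte pages: */
	unsigned long stack_bytes = (1UL << 2) * 4096;	/* == 16384 */
	/* i.e. four contiguous pages per task, matching the length
	 * handed to protect_memory() above */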
diff -Nru a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
--- a/arch/um/kernel/irq.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/irq.c	Fri Oct 31 14:10:54 2003
@@ -83,65 +83,52 @@
 	end_none
 };
 
-/* Not changed */
-volatile unsigned long irq_err_count;
-
 /*
  * Generic, controller-independent functions:
 */
-int get_irq_list(char *buf)
+int show_interrupts(struct seq_file *p, void *v)
 {
 	int i, j;
-	unsigned long flags;
 	struct irqaction * action;
-	char *p = buf;
+	unsigned long flags;
 
-	p += sprintf(p, " ");
-	for (j=0; j<num_online_cpus(); j++)
-		p += sprintf(p, "CPU%d ",j);
-	*p++ = '\n';
+	seq_printf(p, " ");
+	for (j=0; j<NR_CPUS; j++)
+		if (cpu_online(j))
+			seq_printf(p, "CPU%d ",j);
+	seq_putc(p, '\n');
 
 	for (i = 0 ; i < NR_IRQS ; i++) {
 		spin_lock_irqsave(&irq_desc[i].lock, flags);
 		action = irq_desc[i].action;
 		if (!action)
-			goto end;
-		p += sprintf(p, "%3d: ",i);
+			goto skip;
+		seq_printf(p, "%3d: ",i);
 #ifndef CONFIG_SMP
-		p += sprintf(p, "%10u ", kstat_irqs(i));
+		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (j = 0; j < num_online_cpus(); j++)
-			p += sprintf(p, "%10u ",
-				     kstat_cpu(cpu_logical_map(j)).irqs[i]);
+		for (j = 0; j < NR_CPUS; j++)
+			if (cpu_online(j))
+				seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		p += sprintf(p, " %14s", irq_desc[i].handler->typename);
-		p += sprintf(p, " %s", action->name);
+		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %s", action->name);
 
 		for (action=action->next; action; action = action->next)
-			p += sprintf(p, ", %s", action->name);
-		*p++ = '\n';
-	end:
+			seq_printf(p, ", %s", action->name);
+
+		seq_putc(p, '\n');
+skip:
 		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
 	}
-	p += sprintf(p, "\n");
-#ifdef notdef
-#ifdef CONFIG_SMP
-	p += sprintf(p, "LOC: ");
-	for (j = 0; j < num_online_cpus(); j++)
-		p += sprintf(p, "%10u ",
-			     apic_timer_irqs[cpu_logical_map(j)]);
-	p += sprintf(p, "\n");
-#endif
-#endif
-	p += sprintf(p, "ERR: %10lu\n", irq_err_count);
-	return p - buf;
-}
+	seq_printf(p, "NMI: ");
+	for (j = 0; j < NR_CPUS; j++)
+		if (cpu_online(j))
+			seq_printf(p, "%10u ", nmi_count(j));
+	seq_putc(p, '\n');
-
-int show_interrupts(struct seq_file *p, void *v)
-{
-	return(0);
+	return 0;
 }
 
 /*
@@ -282,13 +269,12 @@
 	 * 0 return value means that this irq is already being
 	 * handled by some other CPU. (or is disabled)
 	 */
-	int cpu = smp_processor_id();
 	irq_desc_t *desc = irq_desc + irq;
 	struct irqaction * action;
 	unsigned int status;
 
 	irq_enter();
-	kstat_cpu(cpu).irqs[irq]++;
+	kstat_this_cpu.irqs[irq]++;
 	spin_lock(&desc->lock);
 	desc->handler->ack(irq);
 	/*
@@ -385,7 +371,7 @@
  */
 
 int request_irq(unsigned int irq,
-		void (*handler)(int, void *, struct pt_regs *),
+		irqreturn_t (*handler)(int, void *, struct pt_regs *),
		unsigned long irqflags,
		const char * devname,
		void *dev_id)
@@ -433,15 +419,19 @@
 EXPORT_SYMBOL(request_irq);
 
 int um_request_irq(unsigned int irq, int fd, int type,
-		   void (*handler)(int, void *, struct pt_regs *),
+		   irqreturn_t (*handler)(int, void *, struct pt_regs *),
		   unsigned long irqflags, const char * devname,
		   void *dev_id)
 {
-	int retval;
+	int err;
 
-	retval = request_irq(irq, handler, irqflags, devname, dev_id);
-	if(retval) return(retval);
-	return(activate_fd(irq, fd, type, dev_id));
+	err = request_irq(irq, handler, irqflags, devname, dev_id);
+	if(err)
+		return(err);
+
+	if(fd != -1)
+		err = activate_fd(irq, fd, type, dev_id);
+	return(err);
 }
 
 /* this was setup_x86_irq but it seems pretty generic */
@@ -659,7 +649,7 @@
 		return -EINVAL;
 	tmp = *mask;
 	for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) {
-		int j = sprintf(page, "%04hx", cpus_coerce(tmp));
+		int j = sprintf(page, "%04hx", (short) cpus_coerce(tmp));
 		len += j;
 		page += j;
 		cpus_shift_right(tmp, tmp, 16);
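Two things happen in the irq.c diff above: handlers move to the 2.6 irqreturn_t convention, and um_request_irq() now skips activate_fd() when fd == -1, so callers with no descriptor to watch (the timer, for instance) can still go through the common registration path. A hedged sketch of such a caller; the handler and its body are invented, while TIMER_IRQ, IRQ_READ and SA_INTERRUPT are existing identifiers of this era's UML tree:

	static irqreturn_t demo_timer_handler(int irq, void *dev_id,
					      struct pt_regs *regs)
	{
		/* ... service the tick ... */
		return(IRQ_HANDLED);
	}

	static int demo_setup(void)
	{
		/* fd == -1: register the handler but poll no descriptor */
		return(um_request_irq(TIMER_IRQ, -1, IRQ_READ,
				      demo_timer_handler, SA_INTERRUPT,
				      "demo timer", NULL));
	}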
+ * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + */ + ack_APIC_irq(); +#endif +#endif +} + +/* startup is the same as "enable", shutdown is same as "disable" */ +#define shutdown_none disable_none +#define end_none enable_none + +struct hw_interrupt_type no_irq_type = { + "none", + startup_none, + shutdown_none, + enable_none, + disable_none, + ack_none, + end_none +}; + +/* + * Generic, controller-independent functions: + */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i, j; + struct irqaction * action; + unsigned long flags; + + seq_printf(p, " "); + for (j=0; j<NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "CPU%d ",j); + seq_putc(p, '\n'); + + for (i = 0 ; i < NR_IRQS ; i++) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) + goto skip; + seq_printf(p, "%3d: ",i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); +#endif + seq_printf(p, " %14s", irq_desc[i].handler->typename); + seq_printf(p, " %s", action->name); + + for (action=action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } + seq_printf(p, "NMI: "); + for (j = 0; j < NR_CPUS; j++) + if (cpu_online(j)) + seq_printf(p, "%10u ", nmi_count(j)); + seq_putc(p, '\n'); + + return 0; +} + +/* + * This should really return information about whether + * we should do bottom half handling etc. Right now we + * end up _always_ checking the bottom half, which is a + * waste of time and is not what some drivers would + * prefer. + */ +int handle_IRQ_event(unsigned int irq, struct pt_regs * regs, + struct irqaction * action) +{ + int status = 1; /* Force the "do bottom halves" bit */ + + if (!(action->flags & SA_INTERRUPT)) + local_irq_enable(); + + do { + status |= action->flags; + action->handler(irq, action->dev_id, regs); + action = action->next; + } while (action); + if (status & SA_SAMPLE_RANDOM) + add_interrupt_randomness(irq); + + local_irq_disable(); + + return status; +} + +/* + * Generic enable/disable code: this just calls + * down into the PIC-specific version for the actual + * hardware disable after having gotten the irq + * controller lock. + */ + +/** + * disable_irq_nosync - disable an irq without waiting + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables of an interrupt + * stack. Unlike disable_irq(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. + * + * This function may be called from IRQ context. + */ + +inline void disable_irq_nosync(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->depth++) { + desc->status |= IRQ_DISABLED; + desc->handler->disable(irq); + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +#ifdef CONFIG_SMP +inline void synchronize_irq(unsigned int irq) +{ + /* is there anything to synchronize with? 
*/ + if (!irq_desc[irq].action) + return; + + while (irq_desc[irq].status & IRQ_INPROGRESS) + cpu_relax(); +} +#endif + +/** + * disable_irq - disable an irq and wait for completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables of an interrupt + * stack. That is for two disables you need two enables. This + * function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ + +void disable_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + synchronize_irq(irq); +} + +/** + * enable_irq - enable interrupt handling on an irq + * @irq: Interrupt to enable + * + * Re-enables the processing of interrupts on this IRQ line + * providing no disable_irq calls are now in effect. + * + * This function may be called from IRQ context. + */ + +void enable_irq(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + switch (desc->depth) { + case 1: { + unsigned int status = desc->status & ~IRQ_DISABLED; + desc->status = status; + if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + desc->status = status | IRQ_REPLAY; + hw_resend_irq(desc->handler,irq); + } + desc->handler->enable(irq); + /* fall-through */ + } + default: + desc->depth--; + break; + case 0: + printk(KERN_ERR "enable_irq() unbalanced from %p\n", + __builtin_return_address(0)); + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +unsigned int do_IRQ(int irq, union uml_pt_regs *regs) +{ + /* + * 0 return value means that this irq is already being + * handled by some other CPU. (or is disabled) + */ + irq_desc_t *desc = irq_desc + irq; + struct irqaction * action; + unsigned int status; + + irq_enter(); + kstat_this_cpu.irqs[irq]++; + spin_lock(&desc->lock); + desc->handler->ack(irq); + /* + REPLAY is when Linux resends an IRQ that was dropped earlier + WAITING is used by probe to mark irqs that are being tested + */ + status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); + status |= IRQ_PENDING; /* we _want_ to handle it */ + + /* + * If the IRQ is disabled for whatever reason, we cannot + * use the action we have. + */ + action = NULL; + if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) { + action = desc->action; + status &= ~IRQ_PENDING; /* we commit to handling */ + status |= IRQ_INPROGRESS; /* we are handling it */ + } + desc->status = status; + + /* + * If there is no IRQ handler or it was disabled, exit early. + Since we set PENDING, if another processor is handling + a different instance of this same irq, the other processor + will take care of it. + */ + if (!action) + goto out; + + /* + * Edge triggered interrupts need to remember + * pending events. + * This applies to any hw interrupts that allow a second + * instance of the same irq to arrive while we are in do_IRQ + * or in the handler. But the code here only handles the _second_ + * instance of the irq, not the third or fourth. So it is mostly + * useful for irq hardware that does not mask cleanly in an + * SMP environment. 
+ */ + for (;;) { + spin_unlock(&desc->lock); + handle_IRQ_event(irq, (struct pt_regs *) regs, action); + spin_lock(&desc->lock); + + if (!(desc->status & IRQ_PENDING)) + break; + desc->status &= ~IRQ_PENDING; + } + desc->status &= ~IRQ_INPROGRESS; +out: + /* + * The ->end() handler has to deal with interrupts which got + * disabled while the handler was running. + */ + desc->handler->end(irq); + spin_unlock(&desc->lock); + + irq_exit(); + + return 1; +} + +/** + * request_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. From the point this + * call is made your handler function may be invoked. Since + * your handler function must clear any interrupt the board + * raises, you must take care both to initialise your hardware + * and to set up the interrupt handler in the right order. + * + * Dev_id must be globally unique. Normally the address of the + * device data structure is used as the cookie. Since the handler + * receives this value it makes sense to use it. + * + * If your interrupt is shared you must pass a non NULL dev_id + * as this is required when freeing the interrupt. + * + * Flags: + * + * SA_SHIRQ Interrupt is shared + * + * SA_INTERRUPT Disable local interrupts while processing + * + * SA_SAMPLE_RANDOM The interrupt can be used for entropy + * + */ + +int request_irq(unsigned int irq, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char * devname, + void *dev_id) +{ + int retval; + struct irqaction * action; + +#if 1 + /* + * Sanity-check: shared interrupts should REALLY pass in + * a real dev-ID, otherwise we'll have trouble later trying + * to figure out which interrupt is which (messes up the + * interrupt freeing logic etc). + */ + if (irqflags & SA_SHIRQ) { + if (!dev_id) + printk(KERN_ERR "Bad boy: %s (at 0x%x) called us " + "without a dev_id!\n", devname, (&irq)[-1]); + } +#endif + + if (irq >= NR_IRQS) + return -EINVAL; + if (!handler) + return -EINVAL; + + action = (struct irqaction *) + kmalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = irqflags; + action->mask = 0; + action->name = devname; + action->next = NULL; + action->dev_id = dev_id; + + retval = setup_irq(irq, action); + if (retval) + kfree(action); + return retval; +} + +EXPORT_SYMBOL(request_irq); + +int um_request_irq(unsigned int irq, int fd, int type, + irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, const char * devname, + void *dev_id) +{ + int err; + + err = request_irq(irq, handler, irqflags, devname, dev_id); + if(err) + return(err); + + if(fd != -1) + err = activate_fd(irq, fd, type, dev_id); + return(err); +} + +/* this was setup_x86_irq but it seems pretty generic */ +int setup_irq(unsigned int irq, struct irqaction * new) +{ + int shared = 0; + unsigned long flags; + struct irqaction *old, **p; + irq_desc_t *desc = irq_desc + irq; + + /* + * Some drivers like serial.c use request_irq() heavily, + * so we have to be careful not to interfere with a + * running system. + */ + if (new->flags & SA_SAMPLE_RANDOM) { + /* + * This function might sleep, we want to call it first, + * outside of the atomic block. 
+ * Yes, this might clear the entropy pool if the wrong + * driver is attempted to be loaded, without actually + * installing a new handler, but is this really a problem, + * only the sysadmin is able to do this. + */ + rand_initialize_irq(irq); + } + + /* + * The following block of code has to be executed atomically + */ + spin_lock_irqsave(&desc->lock,flags); + p = &desc->action; + if ((old = *p) != NULL) { + /* Can't share interrupts unless both agree to */ + if (!(old->flags & new->flags & SA_SHIRQ)) { + spin_unlock_irqrestore(&desc->lock,flags); + return -EBUSY; + } + + /* add new interrupt at end of irq queue */ + do { + p = &old->next; + old = *p; + } while (old); + shared = 1; + } + + *p = new; + + if (!shared) { + desc->depth = 0; + desc->status &= ~IRQ_DISABLED; + desc->handler->startup(irq); + } + spin_unlock_irqrestore(&desc->lock,flags); + + register_irq_proc(irq); + return 0; +} + +/** + * free_irq - free an interrupt + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Remove an interrupt handler. The handler is removed and if the + * interrupt line is no longer in use by any driver it is disabled. + * On a shared IRQ the caller must ensure the interrupt is disabled + * on the card it drives before calling this function. The function + * does not return until any executing interrupts for this IRQ + * have completed. + * + * This function may be called from interrupt context. + * + * Bugs: Attempting to free an irq in a handler for the same irq hangs + * the machine. + */ + +void free_irq(unsigned int irq, void *dev_id) +{ + irq_desc_t *desc; + struct irqaction **p; + unsigned long flags; + + if (irq >= NR_IRQS) + return; + + desc = irq_desc + irq; + spin_lock_irqsave(&desc->lock,flags); + p = &desc->action; + for (;;) { + struct irqaction * action = *p; + if (action) { + struct irqaction **pp = p; + p = &action->next; + if (action->dev_id != dev_id) + continue; + + /* Found it - now remove it from the list of entries */ + *pp = action->next; + if (!desc->action) { + desc->status |= IRQ_DISABLED; + desc->handler->shutdown(irq); + } + free_irq_by_irq_and_dev(irq, dev_id); + spin_unlock_irqrestore(&desc->lock,flags); + + /* Wait to make sure it's not being used on another CPU */ + synchronize_irq(irq); + kfree(action); + return; + } + printk(KERN_ERR "Trying to free free IRQ%d\n",irq); + spin_unlock_irqrestore(&desc->lock,flags); + return; + } +} + +EXPORT_SYMBOL(free_irq); + +/* These are initialized by sysctl_init, which is called from init/main.c */ +static struct proc_dir_entry * root_irq_dir; +static struct proc_dir_entry * irq_dir [NR_IRQS]; +static struct proc_dir_entry * smp_affinity_entry [NR_IRQS]; + +/* These are read and written as longs, so a read won't see a partial write + * even during a race. + */ +static cpumask_t irq_affinity [NR_IRQS] = { [0 ... 
NR_IRQS-1] = CPU_MASK_ALL }; + +#define HEX_DIGITS (2*sizeof(cpumask_t)) + +static int irq_affinity_read_proc (char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + if (count < HEX_DIGITS+1) + return -EINVAL; + return sprintf (page, "%08lx\n", irq_affinity[(long)data]); +} + +static unsigned int parse_hex_value (const char *buffer, + unsigned long count, cpumask_t *ret) +{ + unsigned char hexnum [HEX_DIGITS]; + cpumask_t value = CPU_MASK_NONE; + int i; + + if (!count) + return -EINVAL; + if (count > HEX_DIGITS) + count = HEX_DIGITS; + if (copy_from_user(hexnum, buffer, count)) + return -EFAULT; + + /* + * Parse the first 8 characters as a hex string, any non-hex char + * is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the same. + */ + + for (i = 0; i < count; i++) { + unsigned int k, c = hexnum[i]; + + switch (c) { + case '0' ... '9': c -= '0'; break; + case 'a' ... 'f': c -= 'a'-10; break; + case 'A' ... 'F': c -= 'A'-10; break; + default: + goto out; + } + cpus_shift_left(value, value, 16); + for (k = 0; k < 4; ++k) + if (c & (1 << k)) + cpu_set(k, value); + } +out: + *ret = value; + return 0; +} + +static int irq_affinity_write_proc (struct file *file, const char *buffer, + unsigned long count, void *data) +{ + int irq = (long) data, full_count = count, err; + cpumask_t new_value, tmp; + + if (!irq_desc[irq].handler->set_affinity) + return -EIO; + + err = parse_hex_value(buffer, count, &new_value); + +#ifdef CONFIG_SMP + /* + * Do not allow disabling IRQs completely - it's a too easy + * way to make the system unusable accidentally :-) At least + * one online CPU still has to be targeted. + */ + cpus_and(tmp, new_value, cpu_online_map); + if (cpus_empty(tmp)) + return -EINVAL; +#endif + + irq_affinity[irq] = new_value; + irq_desc[irq].handler->set_affinity(irq, new_value); + + return full_count; +} + +static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + cpumask_t tmp, *mask = (cpumask_t *) data; + int k, len = 0; + + if (count < HEX_DIGITS+1) + return -EINVAL; + tmp = *mask; + for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) { + int j = sprintf(page, "%04hx", (short) cpus_coerce(tmp)); + len += j; + page += j; + cpus_shift_right(tmp, tmp, 16); + } + len += sprintf(page, "\n"); + return len; +} + +static int prof_cpu_mask_write_proc (struct file *file, const char *buffer, + unsigned long count, void *data) +{ + cpumask_t *mask = (cpumask_t *)data, new_value; + unsigned long full_count = count, err; + + err = parse_hex_value(buffer, count, &new_value); + if (err) + return err; + + *mask = new_value; + return full_count; +} + +#define MAX_NAMELEN 10 + +static void register_irq_proc (unsigned int irq) +{ + struct proc_dir_entry *entry; + char name [MAX_NAMELEN]; + + if (!root_irq_dir || (irq_desc[irq].handler == &no_irq_type) || + irq_dir[irq]) + return; + + memset(name, 0, MAX_NAMELEN); + sprintf(name, "%d", irq); + + /* create /proc/irq/1234 */ + irq_dir[irq] = proc_mkdir(name, root_irq_dir); + + /* create /proc/irq/1234/smp_affinity */ + entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); + + entry->nlink = 1; + entry->data = (void *)(long)irq; + entry->read_proc = irq_affinity_read_proc; + entry->write_proc = irq_affinity_write_proc; + + smp_affinity_entry[irq] = entry; +} + +/* Read and written as a long */ +cpumask_t prof_cpu_mask = CPU_MASK_ALL; + +void __init init_irq_proc (void) +{ + struct proc_dir_entry *entry; + int i; + + /* create /proc/irq */ + root_irq_dir = 
proc_mkdir("irq", 0); + + /* create /proc/irq/prof_cpu_mask */ + entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); + + entry->nlink = 1; + entry->data = (void *)&prof_cpu_mask; + entry->read_proc = prof_cpu_mask_read_proc; + entry->write_proc = prof_cpu_mask_write_proc; + + /* + * Create entries for all existing IRQs. + */ + for (i = 0; i < NR_IRQS; i++) + register_irq_proc(i); +} + +static spinlock_t irq_spinlock = SPIN_LOCK_UNLOCKED; + +unsigned long irq_lock(void) +{ + unsigned long flags; + + spin_lock_irqsave(&irq_spinlock, flags); + return(flags); +} + +void irq_unlock(unsigned long flags) +{ + spin_unlock_irqrestore(&irq_spinlock, flags); +} + +unsigned long probe_irq_on(void) +{ + return(0); +} + +EXPORT_SYMBOL(probe_irq_on); + +int probe_irq_off(unsigned long val) +{ + return(0); +} + +EXPORT_SYMBOL(probe_irq_off); + +static unsigned int startup_SIGIO_irq(unsigned int irq) +{ + return(0); +} + +static void shutdown_SIGIO_irq(unsigned int irq) +{ +} + +static void enable_SIGIO_irq(unsigned int irq) +{ +} + +static void disable_SIGIO_irq(unsigned int irq) +{ +} + +static void mask_and_ack_SIGIO(unsigned int irq) +{ +} + +static void end_SIGIO_irq(unsigned int irq) +{ +} + +static unsigned int startup_SIGVTALRM_irq(unsigned int irq) +{ + return(0); +} + +static void shutdown_SIGVTALRM_irq(unsigned int irq) +{ +} + +static void enable_SIGVTALRM_irq(unsigned int irq) +{ +} + +static void disable_SIGVTALRM_irq(unsigned int irq) +{ +} + +static void mask_and_ack_SIGVTALRM(unsigned int irq) +{ +} + +static void end_SIGVTALRM_irq(unsigned int irq) +{ +} + +static struct hw_interrupt_type SIGIO_irq_type = { + "SIGIO", + startup_SIGIO_irq, + shutdown_SIGIO_irq, + enable_SIGIO_irq, + disable_SIGIO_irq, + mask_and_ack_SIGIO, + end_SIGIO_irq, + NULL +}; + +static struct hw_interrupt_type SIGVTALRM_irq_type = { + "SIGVTALRM", + startup_SIGVTALRM_irq, + shutdown_SIGVTALRM_irq, + enable_SIGVTALRM_irq, + disable_SIGVTALRM_irq, + mask_and_ack_SIGVTALRM, + end_SIGVTALRM_irq, + NULL +}; + +void __init init_IRQ(void) +{ + int i; + + irq_desc[TIMER_IRQ].status = IRQ_DISABLED; + irq_desc[TIMER_IRQ].action = 0; + irq_desc[TIMER_IRQ].depth = 1; + irq_desc[TIMER_IRQ].handler = &SIGVTALRM_irq_type; + enable_irq(TIMER_IRQ); + for(i=1;i<NR_IRQS;i++){ + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = 0; + irq_desc[i].depth = 1; + irq_desc[i].handler = &SIGIO_irq_type; + enable_irq(i); + } + init_irq_signals(0); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/kernel/irq.c~uml-summa.diff b/arch/um/kernel/irq.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/kernel/irq.c~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,868 @@ +/* + * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + * Derived (i.e. 
mostly copied) from arch/i386/kernel/irq.c: + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + */ + +#include "linux/config.h" +#include "linux/kernel.h" +#include "linux/module.h" +#include "linux/smp.h" +#include "linux/irq.h" +#include "linux/kernel_stat.h" +#include "linux/interrupt.h" +#include "linux/random.h" +#include "linux/slab.h" +#include "linux/file.h" +#include "linux/proc_fs.h" +#include "linux/init.h" +#include "linux/seq_file.h" +#include "asm/irq.h" +#include "asm/hw_irq.h" +#include "asm/hardirq.h" +#include "asm/atomic.h" +#include "asm/signal.h" +#include "asm/system.h" +#include "asm/errno.h" +#include "asm/uaccess.h" +#include "user_util.h" +#include "kern_util.h" +#include "irq_user.h" + +static void register_irq_proc (unsigned int irq); + +irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { + [0 ... NR_IRQS-1] = { + .handler = &no_irq_type, + .lock = SPIN_LOCK_UNLOCKED + } +}; + +/* + * Generic no controller code + */ + +static void enable_none(unsigned int irq) { } +static unsigned int startup_none(unsigned int irq) { return 0; } +static void disable_none(unsigned int irq) { } +static void ack_none(unsigned int irq) +{ +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves, it doesn't deserve + * a generic callback i think. + */ +#ifdef CONFIG_X86 + printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. 
+ */ + ack_APIC_irq(); +#endif +#endif +} + +/* startup is the same as "enable", shutdown is same as "disable" */ +#define shutdown_none disable_none +#define end_none enable_none + +struct hw_interrupt_type no_irq_type = { + "none", + startup_none, + shutdown_none, + enable_none, + disable_none, + ack_none, + end_none +}; + +/* Not changed */ +volatile unsigned long irq_err_count; + +/* + * Generic, controller-independent functions: + */ + +int get_irq_list(char *buf) +{ + int i, j; + unsigned long flags; + struct irqaction * action; + char *p = buf; + + p += sprintf(p, " "); + for (j=0; j<num_online_cpus(); j++) + p += sprintf(p, "CPU%d ",j); + *p++ = '\n'; + + for (i = 0 ; i < NR_IRQS ; i++) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) + goto end; + p += sprintf(p, "%3d: ",i); +#ifndef CONFIG_SMP + p += sprintf(p, "%10u ", kstat_irqs(i)); +#else + for (j = 0; j < num_online_cpus(); j++) + p += sprintf(p, "%10u ", + kstat_cpu(cpu_logical_map(j)).irqs[i]); +#endif + p += sprintf(p, " %14s", irq_desc[i].handler->typename); + p += sprintf(p, " %s", action->name); + + for (action=action->next; action; action = action->next) + p += sprintf(p, ", %s", action->name); + *p++ = '\n'; + end: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } + p += sprintf(p, "\n"); +#ifdef notdef +#ifdef CONFIG_SMP + p += sprintf(p, "LOC: "); + for (j = 0; j < num_online_cpus(); j++) + p += sprintf(p, "%10u ", + apic_timer_irqs[cpu_logical_map(j)]); + p += sprintf(p, "\n"); +#endif +#endif + p += sprintf(p, "ERR: %10lu\n", irq_err_count); + return p - buf; +} + + +int show_interrupts(struct seq_file *p, void *v) +{ + return(0); +} + +/* + * This should really return information about whether + * we should do bottom half handling etc. Right now we + * end up _always_ checking the bottom half, which is a + * waste of time and is not what some drivers would + * prefer. + */ +int handle_IRQ_event(unsigned int irq, struct pt_regs * regs, + struct irqaction * action) +{ + int status = 1; /* Force the "do bottom halves" bit */ + + if (!(action->flags & SA_INTERRUPT)) + local_irq_enable(); + + do { + status |= action->flags; + action->handler(irq, action->dev_id, regs); + action = action->next; + } while (action); + if (status & SA_SAMPLE_RANDOM) + add_interrupt_randomness(irq); + + local_irq_disable(); + + return status; +} + +/* + * Generic enable/disable code: this just calls + * down into the PIC-specific version for the actual + * hardware disable after having gotten the irq + * controller lock. + */ + +/** + * disable_irq_nosync - disable an irq without waiting + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables of an interrupt + * stack. Unlike disable_irq(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. + * + * This function may be called from IRQ context. + */ + +inline void disable_irq_nosync(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->depth++) { + desc->status |= IRQ_DISABLED; + desc->handler->disable(irq); + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +#ifdef CONFIG_SMP +inline void synchronize_irq(unsigned int irq) +{ + /* is there anything to synchronize with? 
*/ + if (!irq_desc[irq].action) + return; + + while (irq_desc[irq].status & IRQ_INPROGRESS) + cpu_relax(); +} +#endif + +/** + * disable_irq - disable an irq and wait for completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables of an interrupt + * stack. That is for two disables you need two enables. This + * function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ + +void disable_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + synchronize_irq(irq); +} + +/** + * enable_irq - enable interrupt handling on an irq + * @irq: Interrupt to enable + * + * Re-enables the processing of interrupts on this IRQ line + * providing no disable_irq calls are now in effect. + * + * This function may be called from IRQ context. + */ + +void enable_irq(unsigned int irq) +{ + irq_desc_t *desc = irq_desc + irq; + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + switch (desc->depth) { + case 1: { + unsigned int status = desc->status & ~IRQ_DISABLED; + desc->status = status; + if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + desc->status = status | IRQ_REPLAY; + hw_resend_irq(desc->handler,irq); + } + desc->handler->enable(irq); + /* fall-through */ + } + default: + desc->depth--; + break; + case 0: + printk(KERN_ERR "enable_irq() unbalanced from %p\n", + __builtin_return_address(0)); + } + spin_unlock_irqrestore(&desc->lock, flags); +} + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +unsigned int do_IRQ(int irq, union uml_pt_regs *regs) +{ + /* + * 0 return value means that this irq is already being + * handled by some other CPU. (or is disabled) + */ + int cpu = smp_processor_id(); + irq_desc_t *desc = irq_desc + irq; + struct irqaction * action; + unsigned int status; + + irq_enter(); + kstat_cpu(cpu).irqs[irq]++; + spin_lock(&desc->lock); + desc->handler->ack(irq); + /* + REPLAY is when Linux resends an IRQ that was dropped earlier + WAITING is used by probe to mark irqs that are being tested + */ + status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); + status |= IRQ_PENDING; /* we _want_ to handle it */ + + /* + * If the IRQ is disabled for whatever reason, we cannot + * use the action we have. + */ + action = NULL; + if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) { + action = desc->action; + status &= ~IRQ_PENDING; /* we commit to handling */ + status |= IRQ_INPROGRESS; /* we are handling it */ + } + desc->status = status; + + /* + * If there is no IRQ handler or it was disabled, exit early. + Since we set PENDING, if another processor is handling + a different instance of this same irq, the other processor + will take care of it. + */ + if (!action) + goto out; + + /* + * Edge triggered interrupts need to remember + * pending events. + * This applies to any hw interrupts that allow a second + * instance of the same irq to arrive while we are in do_IRQ + * or in the handler. But the code here only handles the _second_ + * instance of the irq, not the third or fourth. So it is mostly + * useful for irq hardware that does not mask cleanly in an + * SMP environment. 
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ */
+unsigned int do_IRQ(int irq, union uml_pt_regs *regs)
+{
+	/*
+	 * 0 return value means that this irq is already being
+	 * handled by some other CPU. (or is disabled)
+	 */
+	int cpu = smp_processor_id();
+	irq_desc_t *desc = irq_desc + irq;
+	struct irqaction * action;
+	unsigned int status;
+
+	irq_enter();
+	kstat_cpu(cpu).irqs[irq]++;
+	spin_lock(&desc->lock);
+	desc->handler->ack(irq);
+	/*
+	   REPLAY is when Linux resends an IRQ that was dropped earlier
+	   WAITING is used by probe to mark irqs that are being tested
+	*/
+	status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
+	status |= IRQ_PENDING; /* we _want_ to handle it */
+
+	/*
+	 * If the IRQ is disabled for whatever reason, we cannot
+	 * use the action we have.
+	 */
+	action = NULL;
+	if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) {
+		action = desc->action;
+		status &= ~IRQ_PENDING; /* we commit to handling */
+		status |= IRQ_INPROGRESS; /* we are handling it */
+	}
+	desc->status = status;
+
+	/*
+	 * If there is no IRQ handler or it was disabled, exit early.
+	 * Since we set PENDING, if another processor is handling
+	 * a different instance of this same irq, the other processor
+	 * will take care of it.
+	 */
+	if (!action)
+		goto out;
+
+	/*
+	 * Edge triggered interrupts need to remember
+	 * pending events.
+	 * This applies to any hw interrupts that allow a second
+	 * instance of the same irq to arrive while we are in do_IRQ
+	 * or in the handler. But the code here only handles the _second_
+	 * instance of the irq, not the third or fourth. So it is mostly
+	 * useful for irq hardware that does not mask cleanly in an
+	 * SMP environment.
+	 */
+	for (;;) {
+		spin_unlock(&desc->lock);
+		handle_IRQ_event(irq, (struct pt_regs *) regs, action);
+		spin_lock(&desc->lock);
+
+		if (!(desc->status & IRQ_PENDING))
+			break;
+		desc->status &= ~IRQ_PENDING;
+	}
+	desc->status &= ~IRQ_INPROGRESS;
+out:
+	/*
+	 * The ->end() handler has to deal with interrupts which got
+	 * disabled while the handler was running.
+	 */
+	desc->handler->end(irq);
+	spin_unlock(&desc->lock);
+
+	irq_exit();
+
+	return 1;
+}
+
+/**
+ *	request_irq - allocate an interrupt line
+ *	@irq: Interrupt line to allocate
+ *	@handler: Function to be called when the IRQ occurs
+ *	@irqflags: Interrupt type flags
+ *	@devname: An ascii name for the claiming device
+ *	@dev_id: A cookie passed back to the handler function
+ *
+ *	This call allocates interrupt resources and enables the
+ *	interrupt line and IRQ handling. From the point this
+ *	call is made your handler function may be invoked. Since
+ *	your handler function must clear any interrupt the board
+ *	raises, you must take care both to initialise your hardware
+ *	and to set up the interrupt handler in the right order.
+ *
+ *	Dev_id must be globally unique. Normally the address of the
+ *	device data structure is used as the cookie. Since the handler
+ *	receives this value it makes sense to use it.
+ *
+ *	If your interrupt is shared you must pass a non NULL dev_id
+ *	as this is required when freeing the interrupt.
+ *
+ *	Flags:
+ *
+ *	SA_SHIRQ		Interrupt is shared
+ *
+ *	SA_INTERRUPT		Disable local interrupts while processing
+ *
+ *	SA_SAMPLE_RANDOM	The interrupt can be used for entropy
+ *
+ */
+
+int request_irq(unsigned int irq,
+		void (*handler)(int, void *, struct pt_regs *),
+		unsigned long irqflags,
+		const char * devname,
+		void *dev_id)
+{
+	int retval;
+	struct irqaction * action;
+
+#if 1
+	/*
+	 * Sanity-check: shared interrupts should REALLY pass in
+	 * a real dev-ID, otherwise we'll have trouble later trying
+	 * to figure out which interrupt is which (messes up the
+	 * interrupt freeing logic etc).
+	 */
+	if (irqflags & SA_SHIRQ) {
+		if (!dev_id)
+			printk(KERN_ERR "Bad boy: %s (at 0x%x) called us "
+			       "without a dev_id!\n", devname, (&irq)[-1]);
+	}
+#endif
+
+	if (irq >= NR_IRQS)
+		return -EINVAL;
+	if (!handler)
+		return -EINVAL;
+
+	action = (struct irqaction *)
+		kmalloc(sizeof(struct irqaction), GFP_KERNEL);
+	if (!action)
+		return -ENOMEM;
+
+	action->handler = handler;
+	action->flags = irqflags;
+	action->mask = 0;
+	action->name = devname;
+	action->next = NULL;
+	action->dev_id = dev_id;
+
+	retval = setup_irq(irq, action);
+	if (retval)
+		kfree(action);
+	return retval;
+}
+
+EXPORT_SYMBOL(request_irq);
+
+int um_request_irq(unsigned int irq, int fd, int type,
+		   void (*handler)(int, void *, struct pt_regs *),
+		   unsigned long irqflags, const char * devname,
+		   void *dev_id)
+{
+	int retval;
+
+	retval = request_irq(irq, handler, irqflags, devname, dev_id);
+	if(retval) return(retval);
+	return(activate_fd(irq, fd, type, dev_id));
+}
+
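A usage sketch, not part of the patch, using the handler defined earlier. "EXAMPLE_IRQ" and "example" are invented, and IRQ_READ is assumed to be UML's read-event type for activate_fd(); the call shape of um_request_irq() is taken from the code above.

static struct example_dev example;

static int example_attach(int fd)
{
	int err;

	/* non-NULL dev_id because the irq is shared (SA_SHIRQ) */
	err = um_request_irq(EXAMPLE_IRQ, fd, IRQ_READ, example_intr,
			     SA_SHIRQ | SA_SAMPLE_RANDOM, "example",
			     &example);
	return(err);
}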
+/* this was setup_x86_irq but it seems pretty generic */
+int setup_irq(unsigned int irq, struct irqaction * new)
+{
+	int shared = 0;
+	unsigned long flags;
+	struct irqaction *old, **p;
+	irq_desc_t *desc = irq_desc + irq;
+
+	/*
+	 * Some drivers like serial.c use request_irq() heavily,
+	 * so we have to be careful not to interfere with a
+	 * running system.
+	 */
+	if (new->flags & SA_SAMPLE_RANDOM) {
+		/*
+		 * This function might sleep, we want to call it first,
+		 * outside of the atomic block.
+		 * Yes, this might clear the entropy pool if the wrong
+		 * driver is loaded, without actually installing a new
+		 * handler, but is this really a problem?  Only the
+		 * sysadmin is able to do this.
+		 */
+		rand_initialize_irq(irq);
+	}
+
+	/*
+	 * The following block of code has to be executed atomically
+	 */
+	spin_lock_irqsave(&desc->lock,flags);
+	p = &desc->action;
+	if ((old = *p) != NULL) {
+		/* Can't share interrupts unless both agree to */
+		if (!(old->flags & new->flags & SA_SHIRQ)) {
+			spin_unlock_irqrestore(&desc->lock,flags);
+			return -EBUSY;
+		}
+
+		/* add new interrupt at end of irq queue */
+		do {
+			p = &old->next;
+			old = *p;
+		} while (old);
+		shared = 1;
+	}
+
+	*p = new;
+
+	if (!shared) {
+		desc->depth = 0;
+		desc->status &= ~IRQ_DISABLED;
+		desc->handler->startup(irq);
+	}
+	spin_unlock_irqrestore(&desc->lock,flags);
+
+	register_irq_proc(irq);
+	return 0;
+}
+
+/**
+ *	free_irq - free an interrupt
+ *	@irq: Interrupt line to free
+ *	@dev_id: Device identity to free
+ *
+ *	Remove an interrupt handler. The handler is removed and if the
+ *	interrupt line is no longer in use by any driver it is disabled.
+ *	On a shared IRQ the caller must ensure the interrupt is disabled
+ *	on the card it drives before calling this function. The function
+ *	does not return until any executing interrupts for this IRQ
+ *	have completed.
+ *
+ *	This function may be called from interrupt context.
+ *
+ *	Bugs: Attempting to free an irq in a handler for the same irq hangs
+ *	the machine.
+ */
+
+void free_irq(unsigned int irq, void *dev_id)
+{
+	irq_desc_t *desc;
+	struct irqaction **p;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS)
+		return;
+
+	desc = irq_desc + irq;
+	spin_lock_irqsave(&desc->lock,flags);
+	p = &desc->action;
+	for (;;) {
+		struct irqaction * action = *p;
+		if (action) {
+			struct irqaction **pp = p;
+			p = &action->next;
+			if (action->dev_id != dev_id)
+				continue;
+
+			/* Found it - now remove it from the list of entries */
+			*pp = action->next;
+			if (!desc->action) {
+				desc->status |= IRQ_DISABLED;
+				desc->handler->shutdown(irq);
+			}
+			free_irq_by_irq_and_dev(irq, dev_id);
+			spin_unlock_irqrestore(&desc->lock,flags);
+
+			/* Wait to make sure it's not being used on another CPU */
+			synchronize_irq(irq);
+			kfree(action);
+			return;
+		}
+		printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
+		spin_unlock_irqrestore(&desc->lock,flags);
+		return;
+	}
+}
+
+EXPORT_SYMBOL(free_irq);
+
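The teardown sketch below (not part of the patch) pairs with the attach example earlier. The dev_id must be the same cookie given to request_irq(), otherwise the list walk in free_irq() above will not find the action.

static void example_detach(void)
{
	disable_irq(EXAMPLE_IRQ);		/* quiesce the line first */
	free_irq(EXAMPLE_IRQ, &example);	/* waits for running handlers */
}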
+/* These are initialized by sysctl_init, which is called from init/main.c */
+static struct proc_dir_entry * root_irq_dir;
+static struct proc_dir_entry * irq_dir [NR_IRQS];
+static struct proc_dir_entry * smp_affinity_entry [NR_IRQS];
+
+/* These are read and written as longs, so a read won't see a partial write
+ * even during a race.
+ */
+static cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
+
+#define HEX_DIGITS (2*sizeof(cpumask_t))
+
+static int irq_affinity_read_proc (char *page, char **start, off_t off,
+				   int count, int *eof, void *data)
+{
+	if (count < HEX_DIGITS+1)
+		return -EINVAL;
+	return sprintf (page, "%08lx\n",
+			cpus_coerce(irq_affinity[(long)data]));
+}
+
+static unsigned int parse_hex_value (const char *buffer,
+				     unsigned long count, cpumask_t *ret)
+{
+	unsigned char hexnum [HEX_DIGITS];
+	cpumask_t value = CPU_MASK_NONE;
+	int i;
+
+	if (!count)
+		return -EINVAL;
+	if (count > HEX_DIGITS)
+		count = HEX_DIGITS;
+	if (copy_from_user(hexnum, buffer, count))
+		return -EFAULT;
+
+	/*
+	 * Parse up to HEX_DIGITS characters as a hex string, any non-hex
+	 * char is end-of-string. '00e1', 'e1', '00E1', 'E1' are all the
+	 * same.
+	 */
+
+	for (i = 0; i < count; i++) {
+		unsigned int k, c = hexnum[i];
+
+		switch (c) {
+			case '0' ... '9': c -= '0'; break;
+			case 'a' ... 'f': c -= 'a'-10; break;
+			case 'A' ... 'F': c -= 'A'-10; break;
+		default:
+			goto out;
+		}
+		/* each hex digit carries four bits */
+		cpus_shift_left(value, value, 4);
+		for (k = 0; k < 4; ++k)
+			if (c & (1 << k))
+				cpu_set(k, value);
+	}
+out:
+	*ret = value;
+	return 0;
+}
+
+static int irq_affinity_write_proc (struct file *file, const char *buffer,
+				    unsigned long count, void *data)
+{
+	int irq = (long) data, full_count = count, err;
+	cpumask_t new_value, tmp;
+
+	if (!irq_desc[irq].handler->set_affinity)
+		return -EIO;
+
+	err = parse_hex_value(buffer, count, &new_value);
+	if (err)
+		return err;
+
+#ifdef CONFIG_SMP
+	/*
+	 * Do not allow disabling IRQs completely - it's a too easy
+	 * way to make the system unusable accidentally :-) At least
+	 * one online CPU still has to be targeted.
+	 */
+	cpus_and(tmp, new_value, cpu_online_map);
+	if (cpus_empty(tmp))
+		return -EINVAL;
+#endif
+
+	irq_affinity[irq] = new_value;
+	irq_desc[irq].handler->set_affinity(irq, new_value);
+
+	return full_count;
+}
+
+static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
+				    int count, int *eof, void *data)
+{
+	cpumask_t tmp, *mask = (cpumask_t *) data;
+	int k, len = 0;
+
+	if (count < HEX_DIGITS+1)
+		return -EINVAL;
+	tmp = *mask;
+	for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) {
+		int j = sprintf(page, "%04hx", cpus_coerce(tmp));
+		len += j;
+		page += j;
+		cpus_shift_right(tmp, tmp, 16);
+	}
+	len += sprintf(page, "\n");
+	return len;
+}
+
+static int prof_cpu_mask_write_proc (struct file *file, const char *buffer,
+				     unsigned long count, void *data)
+{
+	cpumask_t *mask = (cpumask_t *)data, new_value;
+	unsigned long full_count = count, err;
+
+	err = parse_hex_value(buffer, count, &new_value);
+	if (err)
+		return err;
+
+	*mask = new_value;
+	return full_count;
+}
+
+#define MAX_NAMELEN 10
+
+static void register_irq_proc (unsigned int irq)
+{
+	struct proc_dir_entry *entry;
+	char name [MAX_NAMELEN];
+
+	if (!root_irq_dir || (irq_desc[irq].handler == &no_irq_type) ||
+	    irq_dir[irq])
+		return;
+
+	memset(name, 0, MAX_NAMELEN);
+	sprintf(name, "%d", irq);
+
+	/* create /proc/irq/1234 */
+	irq_dir[irq] = proc_mkdir(name, root_irq_dir);
+
+	/* create /proc/irq/1234/smp_affinity */
+	entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]);
+
+	entry->nlink = 1;
+	entry->data = (void *)(long)irq;
+	entry->read_proc = irq_affinity_read_proc;
+	entry->write_proc = irq_affinity_write_proc;
+
+	smp_affinity_entry[irq] = entry;
+}
+
+/* Read and written as a long */
+cpumask_t prof_cpu_mask = CPU_MASK_ALL;
+
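A userspace sketch, not part of the patch, showing how the proc handlers above get exercised; the irq number 9 is invented. parse_hex_value() turns "e1" into a mask with CPUs 0, 5, 6 and 7 set, one nibble per hex digit.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/irq/9/smp_affinity", O_WRONLY);

	if(fd < 0)
		return(1);
	/* the write is routed to irq_affinity_write_proc() */
	if(write(fd, "e1", 2) != 2)
		perror("write");
	close(fd);
	return(0);
}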
+void __init init_irq_proc (void)
+{
+	struct proc_dir_entry *entry;
+	int i;
+
+	/* create /proc/irq */
+	root_irq_dir = proc_mkdir("irq", 0);
+
+	/* create /proc/irq/prof_cpu_mask */
+	entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
+
+	entry->nlink = 1;
+	entry->data = (void *)&prof_cpu_mask;
+	entry->read_proc = prof_cpu_mask_read_proc;
+	entry->write_proc = prof_cpu_mask_write_proc;
+
+	/*
+	 * Create entries for all existing IRQs.
+	 */
+	for (i = 0; i < NR_IRQS; i++)
+		register_irq_proc(i);
+}
+
+static spinlock_t irq_spinlock = SPIN_LOCK_UNLOCKED;
+
+unsigned long irq_lock(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&irq_spinlock, flags);
+	return(flags);
+}
+
+void irq_unlock(unsigned long flags)
+{
+	spin_unlock_irqrestore(&irq_spinlock, flags);
+}
+
+unsigned long probe_irq_on(void)
+{
+	return(0);
+}
+
+EXPORT_SYMBOL(probe_irq_on);
+
+int probe_irq_off(unsigned long val)
+{
+	return(0);
+}
+
+EXPORT_SYMBOL(probe_irq_off);
+
+static unsigned int startup_SIGIO_irq(unsigned int irq)
+{
+	return(0);
+}
+
+static void shutdown_SIGIO_irq(unsigned int irq)
+{
+}
+
+static void enable_SIGIO_irq(unsigned int irq)
+{
+}
+
+static void disable_SIGIO_irq(unsigned int irq)
+{
+}
+
+static void mask_and_ack_SIGIO(unsigned int irq)
+{
+}
+
+static void end_SIGIO_irq(unsigned int irq)
+{
+}
+
+static unsigned int startup_SIGVTALRM_irq(unsigned int irq)
+{
+	return(0);
+}
+
+static void shutdown_SIGVTALRM_irq(unsigned int irq)
+{
+}
+
+static void enable_SIGVTALRM_irq(unsigned int irq)
+{
+}
+
+static void disable_SIGVTALRM_irq(unsigned int irq)
+{
+}
+
+static void mask_and_ack_SIGVTALRM(unsigned int irq)
+{
+}
+
+static void end_SIGVTALRM_irq(unsigned int irq)
+{
+}
+
+static struct hw_interrupt_type SIGIO_irq_type = {
+	"SIGIO",
+	startup_SIGIO_irq,
+	shutdown_SIGIO_irq,
+	enable_SIGIO_irq,
+	disable_SIGIO_irq,
+	mask_and_ack_SIGIO,
+	end_SIGIO_irq,
+	NULL
+};
+
+static struct hw_interrupt_type SIGVTALRM_irq_type = {
+	"SIGVTALRM",
+	startup_SIGVTALRM_irq,
+	shutdown_SIGVTALRM_irq,
+	enable_SIGVTALRM_irq,
+	disable_SIGVTALRM_irq,
+	mask_and_ack_SIGVTALRM,
+	end_SIGVTALRM_irq,
+	NULL
+};
+
+void __init init_IRQ(void)
+{
+	int i;
+
+	irq_desc[TIMER_IRQ].status = IRQ_DISABLED;
+	irq_desc[TIMER_IRQ].action = 0;
+	irq_desc[TIMER_IRQ].depth = 1;
+	irq_desc[TIMER_IRQ].handler = &SIGVTALRM_irq_type;
+	enable_irq(TIMER_IRQ);
+	for(i=1;i<NR_IRQS;i++){
+		irq_desc[i].status = IRQ_DISABLED;
+		irq_desc[i].action = 0;
+		irq_desc[i].depth = 1;
+		irq_desc[i].handler = &SIGIO_irq_type;
+		enable_irq(i);
+	}
+	init_irq_signals(0);
+}
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only. This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
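A sketch, not part of the patch, of wiring a hypothetical extra interrupt source the way init_IRQ() wires the SIGIO irqs above. The controller ops can all be no-ops under UML because masking happens on the host side; depth starts at 1 so the first enable_irq() drops it to 0 and unmasks exactly once.

static unsigned int startup_example_irq(unsigned int irq)
{
	return(0);
}

static void noop_example_irq(unsigned int irq)
{
}

static struct hw_interrupt_type example_irq_type = {
	"EXAMPLE",
	startup_example_irq,
	noop_example_irq,	/* shutdown */
	noop_example_irq,	/* enable */
	noop_example_irq,	/* disable */
	noop_example_irq,	/* mask_and_ack */
	noop_example_irq,	/* end */
	NULL			/* set_affinity */
};

static void example_init_one(int irq)
{
	irq_desc[irq].status = IRQ_DISABLED;
	irq_desc[irq].action = 0;
	irq_desc[irq].depth = 1;
	irq_desc[irq].handler = &example_irq_type;
	enable_irq(irq);
}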
diff -Nru a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c
--- a/arch/um/kernel/ksyms.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/ksyms.c	Fri Oct 31 14:10:53 2003
@@ -90,3 +90,5 @@
 EXPORT_SYMBOL(kmap_atomic_to_page);
 #endif
 
+EXPORT_SYMBOL(do_gettimeofday);
+EXPORT_SYMBOL(do_settimeofday);
diff -Nru a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
--- a/arch/um/kernel/mem.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/mem.c	Fri Oct 31 14:10:53 2003
@@ -120,11 +120,6 @@
 	return(kmem_top);
 }
 
-void set_kmem_end(unsigned long new)
-{
-	kmem_top = new;
-}
-
 #ifdef CONFIG_HIGHMEM
 /* Changed during early boot */
 pte_t *kmap_pte;
@@ -222,7 +217,7 @@
 		if(regions[i] == NULL) break;
 	}
 	if(i == NREGIONS){
-		printk("setup_range : no free regions\n");
+		printk("setup_one_range : no free regions\n");
 		i = -1;
 		goto out;
 	}
@@ -231,7 +226,9 @@
 		fd = create_mem_file(len);
 
 	if(region == NULL){
-		region = alloc_bootmem_low_pages(sizeof(*region));
+		if(kmalloc_ok)
+			region = kmalloc(sizeof(*region), GFP_KERNEL);
+		else region = alloc_bootmem_low_pages(sizeof(*region));
 		if(region == NULL)
 			panic("Failed to allocating mem_region");
 	}
@@ -532,9 +529,9 @@
 	return(NREGIONS);
 }
 
-void setup_range(int fd, char *driver, unsigned long start, unsigned long pfn,
-		 unsigned long len, int need_vm, struct mem_region *region,
-		 void *reserved)
+static void setup_range(int fd, char *driver, unsigned long start,
+			unsigned long pfn, unsigned long len, int need_vm,
+			struct mem_region *region, void *reserved)
 {
 	int i, cur;
 
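The second mem.c hunk above introduces a two-phase allocation idiom; here it is in isolation (a sketch, not part of the patch). Before the page allocator is up, only the bootmem allocator works; once mem_init() sets kmalloc_ok, kmalloc() becomes legal and bootmem no longer is.

static struct mem_region *alloc_region(void)
{
	struct mem_region *region;

	if(kmalloc_ok)
		region = kmalloc(sizeof(*region), GFP_KERNEL);
	else region = alloc_bootmem_low_pages(sizeof(*region));
	return(region);
}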
diff -Nru a/arch/um/kernel/mem_user.c b/arch/um/kernel/mem_user.c
--- a/arch/um/kernel/mem_user.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/mem_user.c	Fri Oct 31 14:10:54 2003
@@ -111,6 +111,11 @@
 		offset = 0;
 	}
 
+	if(offset >= region->len){
+		printf("%ld bytes of physical memory is insufficient\n",
+		       region->len);
+		exit(1);
+	}
 	loc = mmap(start, region->len - offset, PROT_READ | PROT_WRITE,
 		   MAP_SHARED | MAP_FIXED, region->fd, offset);
 	if(loc != start){
@@ -122,26 +127,26 @@
 
 static int __init parse_iomem(char *str, int *add)
 {
-	struct stat buf;
+	struct stat64 buf;
 	char *file, *driver;
 	int fd;
 
 	driver = str;
 	file = strchr(str,',');
 	if(file == NULL){
-		printk("parse_iomem : failed to parse iomem\n");
+		printf("parse_iomem : failed to parse iomem\n");
 		return(1);
 	}
 	*file = '\0';
 	file++;
 	fd = os_open_file(file, of_rdwr(OPENFLAGS()), 0);
 	if(fd < 0){
-		printk("parse_iomem - Couldn't open io file, errno = %d\n",
+		printf("parse_iomem - Couldn't open io file, errno = %d\n",
 		       errno);
 		return(1);
 	}
-	if(fstat(fd, &buf) < 0) {
-		printk("parse_iomem - cannot fstat file, errno = %d\n", errno);
+	if(fstat64(fd, &buf) < 0) {
+		printf("parse_iomem - cannot fstat file, errno = %d\n", errno);
 		return(1);
 	}
 	add_iomem(driver, fd, buf.st_size);
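A host-side sketch, not part of the patch, of why the hunk above moves to stat64/fstat64: with a plain struct stat on a 32-bit host, st_size cannot represent iomem files of 2GB or more, while the 64-bit variants carry the full size. "check_size" is an invented helper.

#define _LARGEFILE64_SOURCE
#include <stdio.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

int check_size(char *file)
{
	struct stat64 buf;
	int fd = open(file, O_RDWR);

	if((fd < 0) || (fstat64(fd, &buf) < 0))
		return(-1);
	printf("%s is %lld bytes\n", file, (long long) buf.st_size);
	close(fd);
	return(0);
}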
diff -Nru a/arch/um/kernel/process.c b/arch/um/kernel/process.c
--- a/arch/um/kernel/process.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/process.c	Fri Oct 31 14:10:54 2003
@@ -72,7 +72,6 @@
 		    SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1);
 	set_handler(SIGUSR2, (__sighandler_t) sig_handler,
 		    SA_NOMASK | flags, -1);
-	(void) CHOOSE_MODE(signal(SIGCHLD, SIG_IGN), (void *) 0);
 	signal(SIGHUP, SIG_IGN);
 
 	init_irq_signals(altstack);
@@ -127,7 +126,8 @@
 	if(err < 0) panic("Waiting for outer trampoline failed - errno = %d",
 			  errno);
 	if(!WIFSIGNALED(status) || (WTERMSIG(status) != SIGKILL))
-		panic("outer trampoline didn't exit with SIGKILL");
+		panic("outer trampoline didn't exit with SIGKILL, "
+		      "status = %d", status);
 
 	return(arg.pid);
 }
@@ -229,11 +229,11 @@
 
 int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr)
 {
-	jmp_buf buf;
+	sigjmp_buf buf;
 	int n;
 
 	*jmp_ptr = &buf;
-	n = setjmp(buf);
+	n = sigsetjmp(buf,1);
 	if(n != 0)
 		return(n);
 	(*fn)(arg);
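A sketch, not part of the patch, of the behavioural difference the last hunk depends on: sigsetjmp(buf, 1) saves the signal mask and siglongjmp() restores it, whereas plain setjmp() on Linux does not, so a longjmp taken out of signal context could leave signals blocked. "demo" is an invented name mirroring run_kernel_thread().

#include <setjmp.h>

static sigjmp_buf buf;

static int demo(int (*fn)(void *), void *arg)
{
	if(sigsetjmp(buf, 1) != 0)	/* 1 => save the current sigmask */
		return(1);		/* re-entered via siglongjmp() */
	return((*fn)(arg));
}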
"signal_kern.h" +#include "signal_user.h" +#include "sysdep/ptrace.h" +#include "sysdep/sigcontext.h" +#include "irq_user.h" +#include "ptrace_user.h" +#include "time_user.h" +#include "init.h" +#include "os.h" +#include "uml-config.h" +#include "choose-mode.h" +#include "mode.h" +#ifdef UML_CONFIG_MODE_SKAS +#include "skas.h" +#include "skas_ptrace.h" +#endif + +void init_new_thread_stack(void *sig_stack, void (*usr1_handler)(int)) +{ + int flags = 0, pages; + + if(sig_stack != NULL){ + pages = (1 << UML_CONFIG_KERNEL_STACK_ORDER) - 2; + set_sigstack(sig_stack, pages * page_size()); + flags = SA_ONSTACK; + } + if(usr1_handler) set_handler(SIGUSR1, usr1_handler, flags, -1); +} + +void init_new_thread_signals(int altstack) +{ + int flags = altstack ? SA_ONSTACK : 0; + + set_handler(SIGSEGV, (__sighandler_t) sig_handler, flags, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGTRAP, (__sighandler_t) sig_handler, flags, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGFPE, (__sighandler_t) sig_handler, flags, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGILL, (__sighandler_t) sig_handler, flags, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGBUS, (__sighandler_t) sig_handler, flags, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGWINCH, (__sighandler_t) sig_handler, flags, + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGUSR2, (__sighandler_t) sig_handler, + SA_NOMASK | flags, -1); + (void) CHOOSE_MODE(signal(SIGCHLD, SIG_IGN), (void *) 0); + signal(SIGHUP, SIG_IGN); + + init_irq_signals(altstack); +} + +struct tramp { + int (*tramp)(void *); + void *tramp_data; + unsigned long temp_stack; + int flags; + int pid; +}; + +/* See above for why sigkill is here */ + +int sigkill = SIGKILL; + +int outer_tramp(void *arg) +{ + struct tramp *t; + int sig = sigkill; + + t = arg; + t->pid = clone(t->tramp, (void *) t->temp_stack + page_size()/2, + t->flags, t->tramp_data); + if(t->pid > 0) wait_for_stop(t->pid, SIGSTOP, PTRACE_CONT, NULL); + kill(os_getpid(), sig); + _exit(0); +} + +int start_fork_tramp(void *thread_arg, unsigned long temp_stack, + int clone_flags, int (*tramp)(void *)) +{ + struct tramp arg; + unsigned long sp; + int new_pid, status, err; + + /* The trampoline will run on the temporary stack */ + sp = stack_sp(temp_stack); + + clone_flags |= CLONE_FILES | SIGCHLD; + + arg.tramp = tramp; + arg.tramp_data = thread_arg; + arg.temp_stack = temp_stack; + arg.flags = clone_flags; + + /* Start the process and wait for it to kill itself */ + new_pid = clone(outer_tramp, (void *) sp, clone_flags, &arg); + if(new_pid < 0) return(-errno); + while((err = waitpid(new_pid, &status, 0) < 0) && (errno == EINTR)) ; + if(err < 0) panic("Waiting for outer trampoline failed - errno = %d", + errno); + if(!WIFSIGNALED(status) || (WTERMSIG(status) != SIGKILL)) + panic("outer trampoline didn't exit with SIGKILL"); + + return(arg.pid); +} + +void suspend_new_thread(int fd) +{ + char c; + + os_stop_process(os_getpid()); + + if(read(fd, &c, sizeof(c)) != sizeof(c)) + panic("read failed in suspend_new_thread"); +} + +static int ptrace_child(void *arg) +{ + int pid = os_getpid(); + + if(ptrace(PTRACE_TRACEME, 0, 0, 0) < 0){ + perror("ptrace"); + os_kill_process(pid, 0); + } + os_stop_process(pid); + _exit(os_getpid() == pid); +} + +static int start_ptraced_child(void **stack_out) +{ + void *stack; + unsigned long sp; + int pid, n, status; + + stack = mmap(NULL, PAGE_SIZE, PROT_READ | 
PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if(stack == MAP_FAILED) + panic("check_ptrace : mmap failed, errno = %d", errno); + sp = (unsigned long) stack + PAGE_SIZE - sizeof(void *); + pid = clone(ptrace_child, (void *) sp, SIGCHLD, NULL); + if(pid < 0) + panic("check_ptrace : clone failed, errno = %d", errno); + n = waitpid(pid, &status, WUNTRACED); + if(n < 0) + panic("check_ptrace : wait failed, errno = %d", errno); + if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) + panic("check_ptrace : expected SIGSTOP, got status = %d", + status); + + *stack_out = stack; + return(pid); +} + +static void stop_ptraced_child(int pid, void *stack, int exitcode) +{ + int status, n; + + if(ptrace(PTRACE_CONT, pid, 0, 0) < 0) + panic("check_ptrace : ptrace failed, errno = %d", errno); + n = waitpid(pid, &status, 0); + if(!WIFEXITED(status) || (WEXITSTATUS(status) != exitcode)) + panic("check_ptrace : child exited with status 0x%x", status); + + if(munmap(stack, PAGE_SIZE) < 0) + panic("check_ptrace : munmap failed, errno = %d", errno); +} + +void __init check_ptrace(void) +{ + void *stack; + int pid, syscall, n, status; + + printk("Checking that ptrace can change system call numbers..."); + pid = start_ptraced_child(&stack); + + while(1){ + if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0) + panic("check_ptrace : ptrace failed, errno = %d", + errno); + n = waitpid(pid, &status, WUNTRACED); + if(n < 0) + panic("check_ptrace : wait failed, errno = %d", errno); + if(!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGTRAP)) + panic("check_ptrace : expected SIGTRAP, " + "got status = %d", status); + + syscall = ptrace(PTRACE_PEEKUSER, pid, PT_SYSCALL_NR_OFFSET, + 0); + if(syscall == __NR_getpid){ + n = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, + __NR_getppid); + if(n < 0) + panic("check_ptrace : failed to modify system " + "call, errno = %d", errno); + break; + } + } + stop_ptraced_child(pid, stack, 0); + printk("OK\n"); +} + +int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr) +{ + jmp_buf buf; + int n; + + *jmp_ptr = &buf; + n = setjmp(buf); + if(n != 0) + return(n); + (*fn)(arg); + return(0); +} + +void forward_pending_sigio(int target) +{ + sigset_t sigs; + + if(sigpending(&sigs)) + panic("forward_pending_sigio : sigpending failed"); + if(sigismember(&sigs, SIGIO)) + kill(target, SIGIO); +} + +int can_do_skas(void) +{ +#ifdef UML_CONFIG_MODE_SKAS + struct ptrace_faultinfo fi; + void *stack; + int pid, n, ret = 1; + + printf("Checking for the skas3 patch in the host..."); + pid = start_ptraced_child(&stack); + + n = ptrace(PTRACE_FAULTINFO, pid, 0, &fi); + if(n < 0){ + if(errno == EIO) + printf("not found\n"); + else printf("No (unexpected errno - %d)\n", errno); + ret = 0; + } + else printf("found\n"); + + init_registers(pid); + stop_ptraced_child(pid, stack, 1); + + printf("Checking for /proc/mm..."); + if(access("/proc/mm", W_OK)){ + printf("not found\n"); + ret = 0; + } + else printf("found\n"); + + return(ret); +#else + return(0); +#endif +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. 
diff -Nru a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c
--- a/arch/um/kernel/process_kern.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/process_kern.c	Fri Oct 31 14:10:54 2003
@@ -26,6 +26,7 @@
 #include "asm/spinlock.h"
 #include "asm/uaccess.h"
 #include "asm/user.h"
+#include "asm/io.h"
 #include "user_util.h"
 #include "kern_util.h"
 #include "kern.h"
@@ -52,17 +53,12 @@
 
 struct task_struct *get_task(int pid, int require)
 {
-	struct task_struct *task, *ret;
+	struct task_struct *ret;
 
-	ret = NULL;
 	read_lock(&tasklist_lock);
-	for_each_process(task){
-		if(task->pid == pid){
-			ret = task;
-			break;
-		}
-	}
+	ret = find_task_by_pid(pid);
 	read_unlock(&tasklist_lock);
+
 	if(require && (ret == NULL)) panic("get_task couldn't find a task\n");
 	return(ret);
 }
@@ -103,13 +99,14 @@
 
 int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 {
-	struct task_struct *p;
+	int pid;
 
 	current->thread.request.u.thread.proc = fn;
 	current->thread.request.u.thread.arg = arg;
-	p = do_fork(CLONE_VM | flags, 0, NULL, 0, NULL, NULL);
-	if(IS_ERR(p)) panic("do_fork failed in kernel_thread");
-	return(p->pid);
+	pid = do_fork(CLONE_VM | flags, 0, NULL, 0, NULL, NULL);
+	if(pid < 0)
+		panic("do_fork failed in kernel_thread, errno = %d", pid);
+	return(pid);
 }
 
 void switch_mm(struct mm_struct *prev, struct mm_struct *next,
@@ -129,7 +126,7 @@
 	{ external_pid(task), task });
 }
 
-void *switch_to(void *prev, void *next, void *last)
+void *_switch_to(void *prev, void *next, void *last)
 {
 	return(CHOOSE_MODE(switch_to_tt(prev, next),
 			   switch_to_skas(prev, next)));
@@ -149,7 +146,7 @@
 void exit_thread(void)
 {
 	CHOOSE_MODE(exit_thread_tt(), exit_thread_skas());
-	unprotect_stack((unsigned long) current->thread_info);
+	unprotect_stack((unsigned long) current_thread);
 }
 
 void *get_current(void)
@@ -157,6 +154,10 @@
 	return(current);
 }
 
+void prepare_to_copy(struct task_struct *tsk)
+{
+}
+
 int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 		unsigned long stack_top, struct task_struct * p,
 		struct pt_regs *regs)
@@ -190,7 +191,7 @@
 
 void default_idle(void)
 {
-	idle_timer();
+	uml_idle_timer();
 
 	atomic_inc(&init_mm.mm_count);
 	current->mm = &init_mm;
@@ -251,11 +252,12 @@
 char *current_cmd(void)
 {
 #if defined(CONFIG_SMP) || defined(CONFIG_HIGHMEM)
-	return("(Unknown)");
+	void *addr = virt_to_phys( current->comm);
 #else
 	void *addr = um_virt_to_phys(current, current->mm->arg_start, NULL);
 	return IS_ERR(addr) ? "(Unknown)": __va((unsigned long) addr);
 #endif
+	return IS_ERR(addr) ? 
"(Unknown)": __va((unsigned long) addr); } void force_sigbus(void) @@ -367,10 +369,15 @@ return(clear_user(buf, size)); } +int strlen_user_proc(char *str) +{ + return(strlen_user(str)); +} + int smp_sigio_handler(void) { #ifdef CONFIG_SMP - int cpu = current->thread_info->cpu; + int cpu = current_thread->cpu; IPI_handler(cpu); if(cpu != 0) return(1); @@ -385,7 +392,7 @@ int cpu(void) { - return(current->thread_info->cpu); + return(current_thread->cpu); } /* diff -Nru a/arch/um/kernel/process_kern.c~uml-summa.diff b/arch/um/kernel/process_kern.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/kernel/process_kern.c~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,400 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include "linux/config.h" +#include "linux/kernel.h" +#include "linux/sched.h" +#include "linux/interrupt.h" +#include "linux/mm.h" +#include "linux/slab.h" +#include "linux/utsname.h" +#include "linux/fs.h" +#include "linux/utime.h" +#include "linux/smp_lock.h" +#include "linux/module.h" +#include "linux/init.h" +#include "linux/capability.h" +#include "asm/unistd.h" +#include "asm/mman.h" +#include "asm/segment.h" +#include "asm/stat.h" +#include "asm/pgtable.h" +#include "asm/processor.h" +#include "asm/tlbflush.h" +#include "asm/spinlock.h" +#include "asm/uaccess.h" +#include "asm/user.h" +#include "user_util.h" +#include "kern_util.h" +#include "kern.h" +#include "signal_kern.h" +#include "signal_user.h" +#include "init.h" +#include "irq_user.h" +#include "mem_user.h" +#include "time_user.h" +#include "tlb.h" +#include "frame_kern.h" +#include "sigcontext.h" +#include "2_5compat.h" +#include "os.h" +#include "mode.h" +#include "mode_kern.h" +#include "choose-mode.h" + +/* This is a per-cpu array. A processor only modifies its entry and it only + * cares about its entry, so it's OK if another processor is modifying its + * entry. + */ +struct cpu_task cpu_tasks[NR_CPUS] = { [0 ... NR_CPUS - 1] = { -1, NULL } }; + +struct task_struct *get_task(int pid, int require) +{ + struct task_struct *task, *ret; + + ret = NULL; + read_lock(&tasklist_lock); + for_each_process(task){ + if(task->pid == pid){ + ret = task; + break; + } + } + read_unlock(&tasklist_lock); + if(require && (ret == NULL)) panic("get_task couldn't find a task\n"); + return(ret); +} + +int external_pid(void *t) +{ + struct task_struct *task = t ? 
t : current; + + return(CHOOSE_MODE_PROC(external_pid_tt, external_pid_skas, task)); +} + +int pid_to_processor_id(int pid) +{ + int i; + + for(i = 0; i < ncpus; i++){ + if(cpu_tasks[i].pid == pid) return(i); + } + return(-1); +} + +void free_stack(unsigned long stack, int order) +{ + free_pages(stack, order); +} + +unsigned long alloc_stack(int order, int atomic) +{ + unsigned long page; + int flags = GFP_KERNEL; + + if(atomic) flags |= GFP_ATOMIC; + if((page = __get_free_pages(flags, order)) == 0) + return(0); + stack_protections(page); + return(page); +} + +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) +{ + struct task_struct *p; + + current->thread.request.u.thread.proc = fn; + current->thread.request.u.thread.arg = arg; + p = do_fork(CLONE_VM | flags, 0, NULL, 0, NULL, NULL); + if(IS_ERR(p)) panic("do_fork failed in kernel_thread"); + return(p->pid); +} + +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + if (prev != next) + clear_bit(cpu, &prev->cpu_vm_mask); + set_bit(cpu, &next->cpu_vm_mask); +} + +void set_current(void *t) +{ + struct task_struct *task = t; + + cpu_tasks[task->thread_info->cpu] = ((struct cpu_task) + { external_pid(task), task }); +} + +void *switch_to(void *prev, void *next, void *last) +{ + return(CHOOSE_MODE(switch_to_tt(prev, next), + switch_to_skas(prev, next))); +} + +void interrupt_end(void) +{ + if(need_resched()) schedule(); + if(test_tsk_thread_flag(current, TIF_SIGPENDING)) do_signal(0); +} + +void release_thread(struct task_struct *task) +{ + CHOOSE_MODE(release_thread_tt(task), release_thread_skas(task)); +} + +void exit_thread(void) +{ + CHOOSE_MODE(exit_thread_tt(), exit_thread_skas()); + unprotect_stack((unsigned long) current->thread_info); +} + +void *get_current(void) +{ + return(current); +} + +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, + unsigned long stack_top, struct task_struct * p, + struct pt_regs *regs) +{ + p->thread = (struct thread_struct) INIT_THREAD; + p->thread.kernel_stack = + (unsigned long) p->thread_info + 2 * PAGE_SIZE; + return(CHOOSE_MODE_PROC(copy_thread_tt, copy_thread_skas, nr, + clone_flags, sp, stack_top, p, regs)); +} + +void initial_thread_cb(void (*proc)(void *), void *arg) +{ + int save_kmalloc_ok = kmalloc_ok; + + kmalloc_ok = 0; + CHOOSE_MODE_PROC(initial_thread_cb_tt, initial_thread_cb_skas, proc, + arg); + kmalloc_ok = save_kmalloc_ok; +} + +unsigned long stack_sp(unsigned long page) +{ + return(page + PAGE_SIZE - sizeof(void *)); +} + +int current_pid(void) +{ + return(current->pid); +} + +void default_idle(void) +{ + idle_timer(); + + atomic_inc(&init_mm.mm_count); + current->mm = &init_mm; + current->active_mm = &init_mm; + + while(1){ + /* endless idle loop with no priority at all */ + SET_PRI(current); + + /* + * although we are an idle CPU, we do not want to + * get into the scheduler unnecessarily. 
+ */ + irq_stat[smp_processor_id()].idle_timestamp = jiffies; + if(need_resched()) + schedule(); + + idle_sleep(10); + } +} + +void cpu_idle(void) +{ + CHOOSE_MODE(init_idle_tt(), init_idle_skas()); +} + +int page_size(void) +{ + return(PAGE_SIZE); +} + +int page_mask(void) +{ + return(PAGE_MASK); +} + +void *um_virt_to_phys(struct task_struct *task, unsigned long addr, + pte_t *pte_out) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + if(task->mm == NULL) + return(ERR_PTR(-EINVAL)); + pgd = pgd_offset(task->mm, addr); + pmd = pmd_offset(pgd, addr); + if(!pmd_present(*pmd)) + return(ERR_PTR(-EINVAL)); + pte = pte_offset_kernel(pmd, addr); + if(!pte_present(*pte)) + return(ERR_PTR(-EINVAL)); + if(pte_out != NULL) + *pte_out = *pte; + return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK)); +} + +char *current_cmd(void) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_HIGHMEM) + return("(Unknown)"); +#else + void *addr = um_virt_to_phys(current, current->mm->arg_start, NULL); + return IS_ERR(addr) ? "(Unknown)": __va((unsigned long) addr); +#endif +} + +void force_sigbus(void) +{ + printk(KERN_ERR "Killing pid %d because of a lack of memory\n", + current->pid); + lock_kernel(); + sigaddset(¤t->pending.signal, SIGBUS); + recalc_sigpending(); + current->flags |= PF_SIGNALED; + do_exit(SIGBUS | 0x80); +} + +void dump_thread(struct pt_regs *regs, struct user *u) +{ +} + +void enable_hlt(void) +{ + panic("enable_hlt"); +} + +EXPORT_SYMBOL(enable_hlt); + +void disable_hlt(void) +{ + panic("disable_hlt"); +} + +EXPORT_SYMBOL(disable_hlt); + +extern int signal_frame_size; + +void *um_kmalloc(int size) +{ + return(kmalloc(size, GFP_KERNEL)); +} + +void *um_kmalloc_atomic(int size) +{ + return(kmalloc(size, GFP_ATOMIC)); +} + +unsigned long get_fault_addr(void) +{ + return((unsigned long) current->thread.fault_addr); +} + +EXPORT_SYMBOL(get_fault_addr); + +void not_implemented(void) +{ + printk(KERN_DEBUG "Something isn't implemented in here\n"); +} + +EXPORT_SYMBOL(not_implemented); + +int user_context(unsigned long sp) +{ + unsigned long stack; + + stack = sp & (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER); + stack += 2 * PAGE_SIZE; + return(stack != current->thread.kernel_stack); +} + +extern void remove_umid_dir(void); + +__uml_exitcall(remove_umid_dir); + +extern exitcall_t __uml_exitcall_begin, __uml_exitcall_end; + +void do_uml_exitcalls(void) +{ + exitcall_t *call; + + call = &__uml_exitcall_end; + while (--call >= &__uml_exitcall_begin) + (*call)(); +} + +char *uml_strdup(char *string) +{ + char *new; + + new = kmalloc(strlen(string) + 1, GFP_KERNEL); + if(new == NULL) return(NULL); + strcpy(new, string); + return(new); +} + +void *get_init_task(void) +{ + return(&init_thread_union.thread_info.task); +} + +int copy_to_user_proc(void *to, void *from, int size) +{ + return(copy_to_user(to, from, size)); +} + +int copy_from_user_proc(void *to, void *from, int size) +{ + return(copy_from_user(to, from, size)); +} + +int clear_user_proc(void *buf, int size) +{ + return(clear_user(buf, size)); +} + +int smp_sigio_handler(void) +{ +#ifdef CONFIG_SMP + int cpu = current->thread_info->cpu; + IPI_handler(cpu); + if(cpu != 0) + return(1); +#endif + return(0); +} + +int um_in_interrupt(void) +{ + return(in_interrupt()); +} + +int cpu(void) +{ + return(current->thread_info->cpu); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. 
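um_virt_to_phys() above reports failure through the kernel's ERR_PTR()
convention, which current_cmd() then unpacks with IS_ERR(). A minimal
user-space sketch of that pointer encoding (the helper names mirror
include/linux/err.h; the demo itself is illustrative only):

	#include <stdio.h>

	#define MAX_ERRNO 4095

	static void *ERR_PTR(long error) { return((void *) error); }
	static long PTR_ERR(const void *ptr) { return((long) ptr); }

	/* errno values occupy the top page of the address space, so any
	 * pointer in that range is really an error code */
	static int IS_ERR(const void *ptr)
	{
		return((unsigned long) ptr >= (unsigned long) -MAX_ERRNO);
	}

	int main(void)
	{
		void *addr = ERR_PTR(-22);	/* -EINVAL */

		if(IS_ERR(addr))
			printf("lookup failed, errno = %ld\n", -PTR_ERR(addr));
		return(0);
	}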
diff -Nru a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
--- a/arch/um/kernel/ptrace.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/ptrace.c	Fri Oct 31 14:10:53 2003
@@ -311,11 +311,8 @@
 
 	/* the 0x80 provides a way for the tracing parent to distinguish
 	   between a syscall stop and SIGTRAP delivery */
-	current->exit_code = SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
-					? 0x80 : 0);
-	current->state = TASK_STOPPED;
-	notify_parent(current, SIGCHLD);
-	schedule();
+	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
+				 ? 0x80 : 0));
 
 	/*
 	 * this isn't the same as continuing with a signal, but it will do
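ptrace_notify() above keeps the PT_TRACESYSGOOD behaviour: with that option
set, syscall stops are reported as SIGTRAP | 0x80 so the tracer can tell
them apart from real SIGTRAPs. A user-space sketch of a tracer that relies
on the bit (illustrative only; assumes a libc that exposes PTRACE_SETOPTIONS
and PTRACE_O_TRACESYSGOOD):

	#include <stdio.h>
	#include <signal.h>
	#include <unistd.h>
	#include <sys/types.h>
	#include <sys/ptrace.h>
	#include <sys/wait.h>

	int main(void)
	{
		int status;
		pid_t pid = fork();

		if(pid == 0){
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			raise(SIGSTOP);
			getpid();		/* any syscall will do */
			_exit(0);
		}
		waitpid(pid, &status, 0);	/* child stopped itself */
		ptrace(PTRACE_SETOPTIONS, pid, NULL,
		       (void *) PTRACE_O_TRACESYSGOOD);
		ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
		waitpid(pid, &status, 0);
		if(WIFSTOPPED(status) && (WSTOPSIG(status) == (SIGTRAP | 0x80)))
			printf("syscall stop, not a plain SIGTRAP\n");
		ptrace(PTRACE_CONT, pid, NULL, NULL);
		waitpid(pid, &status, 0);
		return(0);
	}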
diff -Nru a/arch/um/kernel/sigio_kern.c b/arch/um/kernel/sigio_kern.c
--- a/arch/um/kernel/sigio_kern.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/sigio_kern.c	Fri Oct 31 14:10:53 2003
@@ -6,7 +6,8 @@
 #include "linux/kernel.h"
 #include "linux/list.h"
 #include "linux/slab.h"
-#include "asm/irq.h"
+#include "linux/signal.h"
+#include "linux/interrupt.h"
 #include "init.h"
 #include "sigio.h"
 #include "irq_user.h"
@@ -14,10 +15,11 @@
 /* Protected by sigio_lock() called from write_sigio_workaround */
 static int sigio_irq_fd = -1;
 
-void sigio_interrupt(int irq, void *data, struct pt_regs *unused)
+irqreturn_t sigio_interrupt(int irq, void *data, struct pt_regs *unused)
 {
 	read_sigio_fd(sigio_irq_fd);
 	reactivate_fd(sigio_irq_fd, SIGIO_WRITE_IRQ);
+	return(IRQ_HANDLED);
 }
 
 int write_sigio_irq(int fd)
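The sigio_interrupt() conversion above follows the 2.6 handler contract:
interrupt handlers return an irqreturn_t, IRQ_HANDLED when their device
raised the interrupt and IRQ_NONE otherwise, which lets the core detect
stuck shared lines. A self-contained sketch of the contract; the "example"
device is invented:

	#include <linux/interrupt.h>

	struct example_dev {
		int pending;
	};

	static irqreturn_t example_interrupt(int irq, void *dev_id,
					     struct pt_regs *regs)
	{
		struct example_dev *dev = dev_id;

		if(!dev->pending)
			return(IRQ_NONE);	/* shared line, not our device */

		dev->pending = 0;		/* service the device */
		return(IRQ_HANDLED);
	}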
diff -Nru a/arch/um/kernel/signal_kern.c b/arch/um/kernel/signal_kern.c
--- a/arch/um/kernel/signal_kern.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/signal_kern.c	Fri Oct 31 14:10:54 2003
@@ -36,7 +36,7 @@
 	if(sig == SIGSEGV){
 		struct k_sigaction *ka;
 
-		ka = &current->sig->action[SIGSEGV - 1];
+		ka = &current->sighand->action[SIGSEGV - 1];
 		ka->sa.sa_handler = SIG_DFL;
 	}
 	force_sig(SIGSEGV, current);
@@ -142,7 +142,7 @@
 		return(0);
 
 	/* Whee! Actually deliver the signal. */
-	ka = &current->sig->action[sig -1 ];
+	ka = &current->sighand->action[sig -1 ];
 	err = handle_signal(regs, sig, ka, &info, oldset, error);
 	if(!err) return(1);
 
@@ -201,7 +201,7 @@
 	}
 }
 
-int sys_rt_sigsuspend(sigset_t *unewset, size_t sigsetsize)
+int sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
 {
 	sigset_t saveset, newset;
 
@@ -227,6 +227,42 @@
 	}
 }
 
+int sys_sigaction(int sig, const struct old_sigaction __user *act,
+		  struct old_sigaction __user *oact)
+{
+	struct k_sigaction new_ka, old_ka;
+	int ret;
+
+	if (act) {
+		old_sigset_t mask;
+		if (verify_area(VERIFY_READ, act, sizeof(*act)) ||
+		    __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
+		    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
+			return -EFAULT;
+		__get_user(new_ka.sa.sa_flags, &act->sa_flags);
+		__get_user(mask, &act->sa_mask);
+		siginitset(&new_ka.sa.sa_mask, mask);
+	}
+
+	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+	if (!ret && oact) {
+		if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) ||
+		    __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
+		    __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
+			return -EFAULT;
+		__put_user(old_ka.sa.sa_flags, &oact->sa_flags);
+		__put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
+	}
+
+	return ret;
+}
+
+int sys_sigaltstack(const stack_t *uss, stack_t *uoss)
+{
+	return(do_sigaltstack(uss, uoss, PT_REGS_SP(&current->thread.regs)));
+}
+
 static int copy_sc_from_user(struct pt_regs *to, void *from,
 			     struct arch_frame_data *arch)
 {
@@ -239,8 +275,8 @@
 
 int sys_sigreturn(struct pt_regs regs)
 {
-	void *sc = sp_to_sc(PT_REGS_SP(&current->thread.regs));
-	void *mask = sp_to_mask(PT_REGS_SP(&current->thread.regs));
+	void __user *sc = sp_to_sc(PT_REGS_SP(&current->thread.regs));
+	void __user *mask = sp_to_mask(PT_REGS_SP(&current->thread.regs));
 	int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long);
 
 	spin_lock_irq(&current->sighand->siglock);
@@ -257,7 +293,8 @@
 
 int sys_rt_sigreturn(struct pt_regs regs)
 {
-	struct ucontext *uc = sp_to_uc(PT_REGS_SP(&current->thread.regs));
+	unsigned long sp = PT_REGS_SP(&current->thread.regs);
+	struct ucontext __user *uc = sp_to_uc(sp);
 	void *fp;
 	int sig_size = _NSIG_WORDS * sizeof(unsigned long);
 
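The __user annotations introduced above mark pointers into the user address
space for the sparse checker; in a normal build the annotation compiles
away. A sketch of the mechanism, with bad_read() as a hypothetical example
of what sparse rejects:

	#ifdef __CHECKER__
	# define __user __attribute__((noderef, address_space(1)))
	#else
	# define __user
	#endif

	/* sparse flags this dereference: __user memory must be reached
	 * through copy_from_user() and friends, never directly */
	static int bad_read(int __user *p)
	{
		return(*p);
	}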
diff -Nru a/arch/um/kernel/skas/Makefile b/arch/um/kernel/skas/Makefile
--- a/arch/um/kernel/skas/Makefile	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/skas/Makefile	Fri Oct 31 14:10:53 2003
@@ -7,18 +7,22 @@
 	process_kern.o syscall_kern.o syscall_user.o time.o tlb.o trap_user.o \
 	sys-$(SUBARCH)/
 
+host-progs := util/mk_ptregs
+clean-files := include/skas_ptregs.h
+
 USER_OBJS = $(filter %_user.o,$(obj-y)) process.o time.o
 USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file))
 
-include/skas_ptregs.h : util/mk_ptregs
-	util/mk_ptregs > $@
-
-util/mk_ptregs :
-	$(MAKE) -C util
+$(TOPDIR)/arch/um/include/skas_ptregs.h : $(src)/util/mk_ptregs
+	@echo -n '  Generating $@'
+	@$< > $@.tmp
+	@if [ -r $@ ] && cmp -s $@ $@.tmp; then \
+		echo ' (unchanged)'; \
+		rm -f $@.tmp; \
+	else \
+		echo ' (updated)'; \
+		mv -f $@.tmp $@; \
+	fi
 
 $(USER_OBJS) : %.o: %.c
 	$(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $<
-
-clean :
-	$(MAKE) -C util clean
-	$(RM) -f include/skas_ptregs.h
diff -Nru a/arch/um/kernel/skas/include/mode.h b/arch/um/kernel/skas/include/mode.h
--- a/arch/um/kernel/skas/include/mode.h	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/skas/include/mode.h	Fri Oct 31 14:10:54 2003
@@ -20,6 +20,7 @@
 extern void halt_skas(void);
 extern void reboot_skas(void);
 extern void kill_off_processes_skas(void);
+extern int is_skas_winch(int pid, int fd, void *data);
 
 #endif
 
diff -Nru a/arch/um/kernel/skas/include/uaccess.h b/arch/um/kernel/skas/include/uaccess.h
--- a/arch/um/kernel/skas/include/uaccess.h	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/skas/include/uaccess.h	Fri Oct 31 14:10:53 2003
@@ -19,7 +19,7 @@
 #define access_ok_skas(type, addr, size) \
 	((segment_eq(get_fs(), KERNEL_DS)) || \
 	 (((unsigned long) (addr) < TASK_SIZE) && \
-	  ((unsigned long) (addr) + (size) < TASK_SIZE)))
+	  ((unsigned long) (addr) + (size) <= TASK_SIZE)))
 
 static inline int verify_area_skas(int type, const void * addr,
 				   unsigned long size)
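The access_ok_skas() hunk above fixes an off-by-one: a range that ends
exactly at TASK_SIZE still lies entirely inside the user address space, so
the upper bound must be inclusive. A stand-alone illustration with a
made-up boundary value:

	#include <stdio.h>

	#define DEMO_TASK_SIZE 0xc0000000UL	/* stand-in for TASK_SIZE */

	static int range_ok(unsigned long addr, unsigned long size)
	{
		return((addr < DEMO_TASK_SIZE) &&
		       (addr + size <= DEMO_TASK_SIZE));
	}

	int main(void)
	{
		/* a buffer ending exactly on the boundary is legal ... */
		printf("%d\n", range_ok(DEMO_TASK_SIZE - 16, 16));	/* 1 */
		/* ... but the old "<" test rejected it; one byte past
		 * the boundary must still fail */
		printf("%d\n", range_ok(DEMO_TASK_SIZE - 16, 17));	/* 0 */
		return(0);
	}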
diff -Nru a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
--- a/arch/um/kernel/skas/process.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/skas/process.c	Fri Oct 31 14:10:54 2003
@@ -4,6 +4,7 @@
  */
 
 #include <stdlib.h>
+#include <unistd.h>
 #include <errno.h>
 #include <signal.h>
 #include <setjmp.h>
@@ -24,6 +25,16 @@
 #include "os.h"
 #include "proc_mm.h"
 #include "skas_ptrace.h"
+#include "chan_user.h"
+
+int is_skas_winch(int pid, int fd, void *data)
+{
+	if(pid != getpid())
+		return(0);
+
+	register_winch_irq(-1, fd, -1, data);
+	return(1);
+}
 
 unsigned long exec_regs[FRAME_SIZE];
 unsigned long exec_fp_regs[HOST_FP_SIZE];
@@ -48,11 +59,11 @@
 	int err, syscall_nr, status;
 
 	syscall_nr = PT_SYSCALL_NR(regs->skas.regs);
+	UPT_SYSCALL_NR(regs) = syscall_nr;
 	if(syscall_nr < 1){
 		relay_signal(SIGTRAP, regs);
 		return;
 	}
-	UPT_SYSCALL_NR(regs) = syscall_nr;
 
 	err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, __NR_getpid);
 	if(err < 0)
@@ -72,8 +83,6 @@
 	handle_syscall(regs);
 }
 
-int userspace_pid;
-
 static int userspace_tramp(void *arg)
 {
 	init_new_thread_signals(0);
@@ -83,6 +92,8 @@
 	return(0);
 }
 
+int userspace_pid;
+
 void start_userspace(void)
 {
 	void *stack;
@@ -149,6 +160,7 @@
 			case SIGILL:
 			case SIGBUS:
 			case SIGFPE:
+			case SIGWINCH:
 				user_signal(WSTOPSIG(status), regs);
 				break;
 			default:
@@ -172,12 +184,12 @@
 void new_thread(void *stack, void **switch_buf_ptr, void **fork_buf_ptr,
 		void (*handler)(int))
 {
-	jmp_buf switch_buf, fork_buf;
+	sigjmp_buf switch_buf, fork_buf;
 
 	*switch_buf_ptr = &switch_buf;
 	*fork_buf_ptr = &fork_buf;
 
-	if(setjmp(fork_buf) == 0)
+	if(sigsetjmp(fork_buf,1) == 0)
 		new_thread_proc(stack, handler);
 
 	remove_sigstack();
@@ -185,12 +197,12 @@
 
 void thread_wait(void *sw, void *fb)
 {
-	jmp_buf buf, **switch_buf = sw, *fork_buf;
+	sigjmp_buf buf, **switch_buf = sw, *fork_buf;
 
 	*switch_buf = &buf;
 	fork_buf = fb;
-	if(setjmp(buf) == 0)
-		longjmp(*fork_buf, 1);
+	if(sigsetjmp(buf,1) == 0)
+		siglongjmp(*fork_buf, 1);
 }
 
 static int move_registers(int int_op, int fp_op, union uml_pt_regs *regs,
@@ -245,34 +257,34 @@
 
 void switch_threads(void *me, void *next)
 {
-	jmp_buf my_buf, **me_ptr = me, *next_buf = next;
+	sigjmp_buf my_buf, **me_ptr = me, *next_buf = next;
 
 	*me_ptr = &my_buf;
-	if(setjmp(my_buf) == 0)
-		longjmp(*next_buf, 1);
+	if(sigsetjmp(my_buf,1) == 0)
+		siglongjmp(*next_buf, 1);
 }
 
-static jmp_buf initial_jmpbuf;
+static sigjmp_buf initial_jmpbuf;
 
 /* XXX Make these percpu */
 static void (*cb_proc)(void *arg);
 static void *cb_arg;
-static jmp_buf *cb_back;
+static sigjmp_buf *cb_back;
 
 int start_idle_thread(void *stack, void *switch_buf_ptr, void **fork_buf_ptr)
 {
-	jmp_buf **switch_buf = switch_buf_ptr;
+	sigjmp_buf **switch_buf = switch_buf_ptr;
 	int n;
 
 	*fork_buf_ptr = &initial_jmpbuf;
-	n = setjmp(initial_jmpbuf);
+	n = sigsetjmp(initial_jmpbuf,1);
 	if(n == 0)
 		new_thread_proc((void *) stack, new_thread_handler);
 	else if(n == 1)
 		remove_sigstack();
 	else if(n == 2){
 		(*cb_proc)(cb_arg);
-		longjmp(*cb_back, 1);
+		siglongjmp(*cb_back, 1);
 	}
 	else if(n == 3){
 		kmalloc_ok = 0;
@@ -282,7 +294,7 @@
 		kmalloc_ok = 0;
 		return(1);
 	}
-	longjmp(**switch_buf, 1);
+	siglongjmp(**switch_buf, 1);
 }
 
 void remove_sigstack(void)
@@ -297,15 +309,15 @@
 
 void initial_thread_cb_skas(void (*proc)(void *), void *arg)
 {
-	jmp_buf here;
+	sigjmp_buf here;
 
 	cb_proc = proc;
 	cb_arg = arg;
 	cb_back = &here;
 
 	block_signals();
-	if(setjmp(here) == 0)
-		longjmp(initial_jmpbuf, 2);
+	if(sigsetjmp(here,1) == 0)
+		siglongjmp(initial_jmpbuf, 2);
 	unblock_signals();
 
 	cb_proc = NULL;
@@ -316,19 +328,20 @@
 void halt_skas(void)
 {
 	block_signals();
-	longjmp(initial_jmpbuf, 3);
+	siglongjmp(initial_jmpbuf, 3);
 }
 
 void reboot_skas(void)
 {
 	block_signals();
-	longjmp(initial_jmpbuf, 4);
+	siglongjmp(initial_jmpbuf, 4);
 }
 
 int new_mm(int from)
 {
 	struct proc_mm_op copy;
-	int n, fd = os_open_file("/proc/mm", of_write(OPENFLAGS()), 0);
+	int n, fd = os_open_file("/proc/mm",
+				 of_cloexec(of_write(OPENFLAGS())), 0);
 
 	if(fd < 0)
 		return(-errno);
@@ -342,6 +355,7 @@
 		printk("new_mm : /proc/mm copy_segments failed, "
 		       "errno = %d\n", errno);
 	}
+
 	return(fd);
 }
 
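The process.c hunks above replace setjmp()/longjmp() with sigsetjmp(..., 1)
and siglongjmp() so that switching thread contexts also saves and restores
the signal mask; on Linux, plain setjmp() does not preserve it. A minimal
user-space sketch of the pattern:

	#include <stdio.h>
	#include <signal.h>
	#include <setjmp.h>

	static sigjmp_buf resume;

	static void jump_back(void)
	{
		/* returns to sigsetjmp() below with value 1, restoring
		 * the signal mask that was saved there */
		siglongjmp(resume, 1);
	}

	int main(void)
	{
		sigset_t block;

		if(sigsetjmp(resume, 1) == 0){	/* 1 => save the mask */
			sigemptyset(&block);
			sigaddset(&block, SIGUSR1);
			sigprocmask(SIG_BLOCK, &block, NULL);
			jump_back();
		}
		/* SIGUSR1 is no longer blocked here */
		printf("resumed with the original signal mask\n");
		return(0);
	}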
diff -Nru a/arch/um/kernel/skas/process_kern.c b/arch/um/kernel/skas/process_kern.c
--- a/arch/um/kernel/skas/process_kern.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/skas/process_kern.c	Fri Oct 31 14:10:53 2003
@@ -61,9 +61,8 @@
 	thread_wait(&current->thread.mode.skas.switch_buf,
 		    current->thread.mode.skas.fork_buf);
 
-#ifdef CONFIG_SMP
-	schedule_tail(NULL);
-#endif
+	if(current->thread.prev_sched != NULL)
+		schedule_tail(current->thread.prev_sched);
 	current->thread.prev_sched = NULL;
 
 	n = run_kernel_thread(fn, arg, &current->thread.exec_buf);
@@ -93,9 +92,8 @@
 		    current->thread.mode.skas.fork_buf);
 
 	force_flush_all();
-#ifdef CONFIG_SMP
-	schedule_tail(current->thread.prev_sched);
-#endif
+	if(current->thread.prev_sched != NULL)
+		schedule_tail(current->thread.prev_sched);
 	current->thread.prev_sched = NULL;
 	unblock_signals();
 
@@ -136,7 +134,7 @@
 
 void init_idle_skas(void)
 {
-	cpu_tasks[current->thread_info->cpu].pid = os_getpid();
+	cpu_tasks[current_thread->cpu].pid = os_getpid();
 	default_idle();
 }
 
@@ -164,7 +162,7 @@
 	capture_signal_stack();
 
 	init_new_thread_signals(1);
-	idle_timer();
+	uml_idle_timer();
 
 	init_task.thread.request.u.thread.proc = start_kernel_proc;
 	init_task.thread.request.u.thread.arg = NULL;
+ * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/kernel/skas/util/mk_ptregs.c b/arch/um/kernel/skas/util/mk_ptregs.c --- a/arch/um/kernel/skas/util/mk_ptregs.c Fri Oct 31 14:10:53 2003 +++ b/arch/um/kernel/skas/util/mk_ptregs.c Fri Oct 31 14:10:53 2003 @@ -1,3 +1,4 @@ +#include <stdio.h> #include <asm/ptrace.h> #include <asm/user.h> diff -Nru a/arch/um/kernel/skas/util/mk_ptregs.c~uml-summa.diff b/arch/um/kernel/skas/util/mk_ptregs.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/kernel/skas/util/mk_ptregs.c~uml-summa.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,50 @@ +#include <asm/ptrace.h> +#include <asm/user.h> + +#define PRINT_REG(name, val) printf("#define HOST_%s %d\n", (name), (val)) + +int main(int argc, char **argv) +{ + printf("/* Automatically generated by " + "arch/um/kernel/skas/util/mk_ptregs */\n"); + printf("\n"); + printf("#ifndef __SKAS_PT_REGS_\n"); + printf("#define __SKAS_PT_REGS_\n"); + printf("\n"); + printf("#define HOST_FRAME_SIZE %d\n", FRAME_SIZE); + printf("#define HOST_FP_SIZE %d\n", + sizeof(struct user_i387_struct) / sizeof(unsigned long)); + printf("#define HOST_XFP_SIZE %d\n", + sizeof(struct user_fxsr_struct) / sizeof(unsigned long)); + + PRINT_REG("IP", EIP); + PRINT_REG("SP", UESP); + PRINT_REG("EFLAGS", EFL); + PRINT_REG("EAX", EAX); + PRINT_REG("EBX", EBX); + PRINT_REG("ECX", ECX); + PRINT_REG("EDX", EDX); + PRINT_REG("ESI", ESI); + PRINT_REG("EDI", EDI); + PRINT_REG("EBP", EBP); + PRINT_REG("CS", CS); + PRINT_REG("SS", SS); + PRINT_REG("DS", DS); + PRINT_REG("FS", FS); + PRINT_REG("ES", ES); + PRINT_REG("GS", GS); + printf("\n"); + printf("#endif\n"); + return(0); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. 
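The schedule_tail() change in skas/process_kern.c above makes the call unconditional rather than SMP-only: schedule_tail() performs the finish_task_switch() bookkeeping the scheduler expects after every context switch, so a freshly started thread must call it whenever it was switched in from another task. A minimal sketch of the resulting trampoline pattern (illustrative only; the helper name is hypothetical, not part of the patch):

	/* Runs as the first code of a newly forked UML thread. */
	static void finish_fork_switch(void)
	{
		/* If we were switched in from another task, complete the
		 * bookkeeping that schedule() could not do on our behalf. */
		if (current->thread.prev_sched != NULL)
			schedule_tail(current->thread.prev_sched);
		current->thread.prev_sched = NULL;
	}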
diff -Nru a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
--- a/arch/um/kernel/smp.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/smp.c	Fri Oct 31 14:10:53 2003
@@ -23,7 +23,7 @@
 #include "os.h"
 
 /* CPU online map, set by smp_boot_cpus */
-unsigned long cpu_online_map = cpumask_of_cpu(0);
+unsigned long cpu_online_map = CPU_MASK_NONE;
 
 EXPORT_SYMBOL(cpu_online_map);
 
@@ -100,15 +100,15 @@
 
 	printk(KERN_INFO "Stopping all CPUs...");
 	for(i = 0; i < num_online_cpus(); i++){
-		if(i == current->thread_info->cpu)
+		if(i == current_thread->cpu)
 			continue;
 		write(cpu_data[i].ipi_pipe[1], "S", 1);
 	}
 	printk("done\n");
 }
 
-static cpumask_t smp_commenced_mask;
-static cpumask_t smp_callin_map = CPU_MASK_NONE;
+static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
+static cpumask_t cpu_callin_map = CPU_MASK_NONE;
 
 static int idle_proc(void *cpup)
 {
@@ -123,12 +123,12 @@
 		     current->thread.mode.tt.extern_pid);
 
 	wmb();
-	if (cpu_test_and_set(cpu, &smp_callin_map)) {
+	if (cpu_test_and_set(cpu, cpu_callin_map)) {
 		printk("huh, CPU#%d already present??\n", cpu);
 		BUG();
 	}
 
-	while (!cpu_isset(cpu, &smp_commenced_mask))
+	while (!cpu_isset(cpu, smp_commenced_mask))
 		cpu_relax();
 
 	cpu_set(cpu, cpu_online_map);
@@ -143,8 +143,11 @@
 
 	current->thread.request.u.thread.proc = idle_proc;
 	current->thread.request.u.thread.arg = (void *) cpu;
-	new_task = do_fork(CLONE_VM | CLONE_IDLETASK, 0, NULL, 0, NULL, NULL);
-	if(IS_ERR(new_task)) panic("do_fork failed in idle_thread");
+	new_task = copy_process(CLONE_VM | CLONE_IDLETASK, 0, NULL, 0, NULL,
+				NULL);
+	if(IS_ERR(new_task))
+		panic("copy_process failed in idle_thread, error = %ld",
+		      PTR_ERR(new_task));
 
 	cpu_tasks[cpu] = ((struct cpu_task)
 		          { .pid = 	new_task->thread.mode.tt.extern_pid,
@@ -153,6 +156,7 @@
 	CHOOSE_MODE(write(new_task->thread.mode.tt.switch_pipe[1], &c,
 			  sizeof(c)),
 		    ({ panic("skas mode doesn't support SMP"); }));
+	wake_up_forked_process(new_task);
 	return(new_task);
 }
 
@@ -160,15 +164,16 @@
 {
 	struct task_struct *idle;
 	unsigned long waittime;
-	int err, cpu;
+	int err, cpu, me = smp_processor_id();
 
-	cpu_set(0, cpu_online_map);
-	cpu_set(0, smp_callin_map);
+	cpu_clear(me, cpu_online_map);
+	cpu_set(me, cpu_online_map);
+	cpu_set(me, cpu_callin_map);
 
-	err = os_pipe(cpu_data[0].ipi_pipe, 1, 1);
+	err = os_pipe(cpu_data[me].ipi_pipe, 1, 1);
 	if(err)	panic("CPU#0 failed to create IPI pipe, errno = %d", -err);
 
-	activate_ipi(cpu_data[0].ipi_pipe[0],
+	activate_ipi(cpu_data[me].ipi_pipe[0],
 		     current->thread.mode.tt.extern_pid);
 
 	for(cpu = 1; cpu < ncpus; cpu++){
@@ -180,10 +185,10 @@
 		unhash_process(idle);
 
 		waittime = 200000000;
-		while (waittime-- && !cpu_isset(cpu, smp_callin_map))
+		while (waittime-- && !cpu_isset(cpu, cpu_callin_map))
 			cpu_relax();
 
-		if (cpu_isset(cpu, smp_callin_map))
+		if (cpu_isset(cpu, cpu_callin_map))
 			printk("done\n");
 		else printk("failed\n");
 	}
@@ -273,7 +278,7 @@
 	info = _info;
 
 	for (i=0;i<NR_CPUS;i++)
-		if((i != current->thread_info->cpu) &&
+		if((i != current_thread->cpu) &&
 		   cpu_isset(i, cpu_online_map))
			write(cpu_data[i].ipi_pipe[1], "C", 1);
diff -Nru a/arch/um/kernel/sys_call_table.c b/arch/um/kernel/sys_call_table.c
--- a/arch/um/kernel/sys_call_table.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/sys_call_table.c	Fri Oct 31 14:10:54 2003
@@ -219,15 +219,30 @@
 extern syscall_handler_t sys_gettid;
 extern syscall_handler_t sys_readahead;
 extern syscall_handler_t sys_tkill;
+extern syscall_handler_t sys_setxattr;
+extern syscall_handler_t sys_lsetxattr;
+extern syscall_handler_t sys_fsetxattr;
+extern syscall_handler_t sys_getxattr;
+extern syscall_handler_t sys_lgetxattr;
+extern syscall_handler_t sys_fgetxattr;
+extern syscall_handler_t sys_listxattr;
+extern syscall_handler_t sys_llistxattr;
+extern syscall_handler_t sys_flistxattr;
+extern syscall_handler_t sys_removexattr;
+extern syscall_handler_t sys_lremovexattr;
+extern syscall_handler_t sys_fremovexattr;
 extern syscall_handler_t sys_sendfile64;
 extern syscall_handler_t sys_futex;
 extern syscall_handler_t sys_sched_setaffinity;
 extern syscall_handler_t sys_sched_getaffinity;
+extern syscall_handler_t sys_set_thread_area;
+extern syscall_handler_t sys_get_thread_area;
 extern syscall_handler_t sys_io_setup;
 extern syscall_handler_t sys_io_destroy;
 extern syscall_handler_t sys_io_getevents;
 extern syscall_handler_t sys_io_submit;
 extern syscall_handler_t sys_io_cancel;
+extern syscall_handler_t sys_fadvise64;
 extern syscall_handler_t sys_exit_group;
 extern syscall_handler_t sys_lookup_dcookie;
 extern syscall_handler_t sys_epoll_create;
@@ -235,6 +250,21 @@
 extern syscall_handler_t sys_epoll_wait;
 extern syscall_handler_t sys_remap_file_pages;
 extern syscall_handler_t sys_set_tid_address;
+extern syscall_handler_t sys_timer_create;
+extern syscall_handler_t sys_timer_settime;
+extern syscall_handler_t sys_timer_gettime;
+extern syscall_handler_t sys_timer_getoverrun;
+extern syscall_handler_t sys_timer_delete;
+extern syscall_handler_t sys_clock_settime;
+extern syscall_handler_t sys_clock_gettime;
+extern syscall_handler_t sys_clock_getres;
+extern syscall_handler_t sys_clock_nanosleep;
+extern syscall_handler_t sys_statfs64;
+extern syscall_handler_t sys_fstatfs64;
+extern syscall_handler_t sys_tgkill;
+extern syscall_handler_t sys_utimes;
+extern syscall_handler_t sys_fadvise64_64;
+extern syscall_handler_t sys_reiser4;
 
 #ifdef CONFIG_NFSD
 #define NFSSERVCTL sys_nfsservctl
@@ -246,7 +276,7 @@
 extern syscall_handler_t um_time;
 extern syscall_handler_t um_stime;
 
-#define LAST_GENERIC_SYSCALL __NR_set_tid_address
+#define LAST_GENERIC_SYSCALL __NR_reiser4
 
 #if LAST_GENERIC_SYSCALL > LAST_ARCH_SYSCALL
#define LAST_SYSCALL LAST_GENERIC_SYSCALL @@ -455,32 +485,37 @@ [ __NR_stat64 ] = sys_stat64, [ __NR_lstat64 ] = sys_lstat64, [ __NR_fstat64 ] = sys_fstat64, - [ __NR_fcntl64 ] = sys_fcntl64, [ __NR_getdents64 ] = sys_getdents64, + [ __NR_fcntl64 ] = sys_fcntl64, + [ 223 ] = sys_ni_syscall, [ __NR_gettid ] = sys_gettid, [ __NR_readahead ] = sys_readahead, - [ __NR_setxattr ] = sys_ni_syscall, - [ __NR_lsetxattr ] = sys_ni_syscall, - [ __NR_fsetxattr ] = sys_ni_syscall, - [ __NR_getxattr ] = sys_ni_syscall, - [ __NR_lgetxattr ] = sys_ni_syscall, - [ __NR_fgetxattr ] = sys_ni_syscall, - [ __NR_listxattr ] = sys_ni_syscall, - [ __NR_llistxattr ] = sys_ni_syscall, - [ __NR_flistxattr ] = sys_ni_syscall, - [ __NR_removexattr ] = sys_ni_syscall, - [ __NR_lremovexattr ] = sys_ni_syscall, - [ __NR_fremovexattr ] = sys_ni_syscall, + [ __NR_setxattr ] = sys_setxattr, + [ __NR_lsetxattr ] = sys_lsetxattr, + [ __NR_fsetxattr ] = sys_fsetxattr, + [ __NR_getxattr ] = sys_getxattr, + [ __NR_lgetxattr ] = sys_lgetxattr, + [ __NR_fgetxattr ] = sys_fgetxattr, + [ __NR_listxattr ] = sys_listxattr, + [ __NR_llistxattr ] = sys_llistxattr, + [ __NR_flistxattr ] = sys_flistxattr, + [ __NR_removexattr ] = sys_removexattr, + [ __NR_lremovexattr ] = sys_lremovexattr, + [ __NR_fremovexattr ] = sys_fremovexattr, [ __NR_tkill ] = sys_tkill, [ __NR_sendfile64 ] = sys_sendfile64, [ __NR_futex ] = sys_futex, [ __NR_sched_setaffinity ] = sys_sched_setaffinity, [ __NR_sched_getaffinity ] = sys_sched_getaffinity, + [ __NR_set_thread_area ] = sys_ni_syscall, + [ __NR_get_thread_area ] = sys_ni_syscall, [ __NR_io_setup ] = sys_io_setup, [ __NR_io_destroy ] = sys_io_destroy, [ __NR_io_getevents ] = sys_io_getevents, [ __NR_io_submit ] = sys_io_submit, [ __NR_io_cancel ] = sys_io_cancel, + [ __NR_fadvise64 ] = sys_fadvise64, + [ 251 ] = sys_ni_syscall, [ __NR_exit_group ] = sys_exit_group, [ __NR_lookup_dcookie ] = sys_lookup_dcookie, [ __NR_epoll_create ] = sys_epoll_create, @@ -488,6 +523,25 @@ [ __NR_epoll_wait ] = sys_epoll_wait, [ __NR_remap_file_pages ] = sys_remap_file_pages, [ __NR_set_tid_address ] = sys_set_tid_address, + [ __NR_timer_create ] = sys_timer_create, + [ __NR_timer_settime ] = sys_timer_settime, + [ __NR_timer_gettime ] = sys_timer_gettime, + [ __NR_timer_getoverrun ] = sys_timer_getoverrun, + [ __NR_timer_delete ] = sys_timer_delete, + [ __NR_clock_settime ] = sys_clock_settime, + [ __NR_clock_gettime ] = sys_clock_gettime, + [ __NR_clock_getres ] = sys_clock_getres, + [ __NR_clock_nanosleep ] = sys_clock_nanosleep, + [ __NR_statfs64 ] = sys_statfs64, + [ __NR_fstatfs64 ] = sys_fstatfs64, + [ __NR_tgkill ] = sys_tgkill, + [ __NR_utimes ] = sys_utimes, + [ __NR_fadvise64_64 ] = sys_fadvise64_64, +#ifdef CONFIG_REISER4_FS + [ __NR_reiser4 ] = sys_reiser4, +#else + [ __NR_reiser4 ] = sys_ni_syscall, +#endif ARCH_SYSCALLS [ LAST_SYSCALL + 1 ... 
	  NR_syscalls ] =
diff -Nru a/arch/um/kernel/syscall_kern.c b/arch/um/kernel/syscall_kern.c
--- a/arch/um/kernel/syscall_kern.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/syscall_kern.c	Fri Oct 31 14:10:54 2003
@@ -35,39 +35,40 @@
 
 long sys_fork(void)
 {
-	struct task_struct *p;
+	long ret;
 
 	current->thread.forking = 1;
-	p = do_fork(SIGCHLD, 0, NULL, 0, NULL, NULL);
+	ret = do_fork(SIGCHLD, 0, NULL, 0, NULL, NULL);
 	current->thread.forking = 0;
-	return(IS_ERR(p) ? PTR_ERR(p) : p->pid);
+	return(ret);
 }
 
-long sys_clone(unsigned long clone_flags, unsigned long newsp)
+long sys_clone(unsigned long clone_flags, unsigned long newsp,
+	       int *parent_tid, int *child_tid)
 {
-	struct task_struct *p;
+	long ret;
 
 	current->thread.forking = 1;
-	p = do_fork(clone_flags, newsp, NULL, 0, NULL, NULL);
+	ret = do_fork(clone_flags, newsp, NULL, 0, parent_tid, child_tid);
 	current->thread.forking = 0;
-	return(IS_ERR(p) ? PTR_ERR(p) : p->pid);
+	return(ret);
 }
 
 long sys_vfork(void)
 {
-	struct task_struct *p;
+	long ret;
 
 	current->thread.forking = 1;
-	p = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, NULL, 0, NULL, NULL);
+	ret = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, NULL, 0, NULL,
+		      NULL);
 	current->thread.forking = 0;
-	return(IS_ERR(p) ? PTR_ERR(p) : p->pid);
+	return(ret);
 }
 
 /* common code for old and new mmaps */
-static inline long do_mmap2(
-	unsigned long addr, unsigned long len,
-	unsigned long prot, unsigned long flags,
-	unsigned long fd, unsigned long pgoff)
+long do_mmap2(struct mm_struct *mm, unsigned long addr, unsigned long len,
+	      unsigned long prot, unsigned long flags, unsigned long fd,
+	      unsigned long pgoff)
 {
 	int error = -EBADF;
 	struct file * file = NULL;
@@ -79,9 +80,9 @@
 		goto out;
 	}
 
-	down_write(&current->mm->mmap_sem);
-	error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-	up_write(&current->mm->mmap_sem);
+	down_write(&mm->mmap_sem);
+	error = do_mmap_pgoff(mm, file, addr, len, prot, flags, pgoff);
+	up_write(&mm->mmap_sem);
 
 	if (file)
 		fput(file);
@@ -93,7 +94,7 @@
 	     unsigned long prot, unsigned long flags,
 	     unsigned long fd, unsigned long pgoff)
 {
-	return do_mmap2(addr, len, prot, flags, fd, pgoff);
+	return do_mmap2(current->mm, addr, len, prot, flags, fd, pgoff);
 }
 
 /*
@@ -120,7 +121,8 @@
 	if (offset & ~PAGE_MASK)
 		goto out;
 
-	err = do_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT);
+	err = do_mmap2(current->mm, addr, len, prot, flags, fd,
+		       offset >> PAGE_SHIFT);
 out:
 	return err;
 }
@@ -141,37 +143,6 @@
 	return error;
 }
 
-int sys_sigaction(int sig, const struct old_sigaction *act,
-		  struct old_sigaction *oact)
-{
-	struct k_sigaction new_ka, old_ka;
-	int ret;
-
-	if (act) {
-		old_sigset_t mask;
-		if (verify_area(VERIFY_READ, act, sizeof(*act)) ||
-		    __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
-		    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
-			return -EFAULT;
-		__get_user(new_ka.sa.sa_flags, &act->sa_flags);
-		__get_user(mask, &act->sa_mask);
-		siginitset(&new_ka.sa.sa_mask, mask);
-	}
-
-	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
-
-	if (!ret && oact) {
-		if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) ||
-		    __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
-		    __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
-			return -EFAULT;
-		__put_user(old_ka.sa.sa_flags, &oact->sa_flags);
-		__put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
-	}
-
-	return ret;
-}
-
 /*
  * sys_ipc() is the de-multiplexer for the SysV IPC calls..
 *
@@ -253,7 +224,7 @@
 		return sys_shmctl (first, second,
 				   (struct shmid_ds *) ptr);
 	default:
-		return -EINVAL;
+		return -ENOSYS;
 	}
 }
 
@@ -300,11 +271,6 @@
 	error = error ? -EFAULT : 0;
 
 	return error;
-}
-
-int sys_sigaltstack(const stack_t *uss, stack_t *uoss)
-{
-	return(do_sigaltstack(uss, uoss, PT_REGS_SP(&current->thread.regs)));
 }
 
 long execute_syscall(void *r)
diff -Nru a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c
--- a/arch/um/kernel/sysrq.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/sysrq.c	Fri Oct 31 14:10:53 2003
@@ -55,6 +55,14 @@
 	show_trace((unsigned long *)esp);
 }
 
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+	if(task)
+		show_trace_task(task);
+	else
+		show_trace(sp);
+}
+
 /*
  * Overrides for Emacs so that we follow Linus's tabbing style.
  * Emacs will notice this stuff at the end of the file and automatically
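The sysrq.c hunk above adds show_stack(), the generic stack-dump entry point that 2.6 expects each architecture to provide; a NULL task means "dump the current stack". A usage sketch under that assumption (illustrative; "some_task" is a hypothetical name, not part of the patch):

	/* Dump the current thread's stack, as a sysrq handler would. */
	show_stack(NULL, NULL);		/* NULL task falls through to show_trace() */

	/* Dump another task's stack from its saved stack pointer. */
	show_stack(some_task, NULL);	/* non-NULL task uses show_trace_task() */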
+ * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/kernel/time.c b/arch/um/kernel/time.c --- a/arch/um/kernel/time.c Fri Oct 31 14:10:53 2003 +++ b/arch/um/kernel/time.c Fri Oct 31 14:10:53 2003 @@ -16,12 +16,16 @@ #include "process.h" #include "signal_user.h" #include "time_user.h" +#include "kern_constants.h" extern struct timeval xtime; +struct timeval local_offset = { 0, 0 }; + void timer(void) { gettimeofday(&xtime, NULL); + timeradd(&xtime, &local_offset, &xtime); } void set_interval(int timer_type) @@ -66,7 +70,7 @@ errno); } -void idle_timer(void) +void uml_idle_timer(void) { if(signal(SIGVTALRM, SIG_IGN) == SIG_ERR) panic("Couldn't unset SIGVTALRM handler"); @@ -83,8 +87,6 @@ set_interval(ITIMER_VIRTUAL); } -struct timeval local_offset = { 0, 0 }; - void do_gettimeofday(struct timeval *tv) { unsigned long flags; @@ -95,15 +97,13 @@ time_unlock(flags); } -EXPORT_SYMBOL(do_gettimeofday); - int do_settimeofday(struct timespec *tv) { struct timeval now; unsigned long flags; struct timeval tv_in; - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + if ((unsigned long) tv->tv_nsec >= UM_NSEC_PER_SEC) return -EINVAL; tv_in.tv_sec = tv->tv_sec; @@ -113,9 +113,9 @@ gettimeofday(&now, NULL); timersub(&tv_in, &now, &local_offset); time_unlock(flags); -} -EXPORT_SYMBOL(do_settimeofday); + return(0); +} void idle_sleep(int secs) { diff -Nru a/arch/um/kernel/time.c~uml-export-in-ksyms.c.diff b/arch/um/kernel/time.c~uml-export-in-ksyms.c.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/kernel/time.c~uml-export-in-ksyms.c.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include <stdio.h> +#include <unistd.h> +#include <time.h> +#include <sys/time.h> +#include <signal.h> +#include <errno.h> +#include "linux/module.h" +#include "user_util.h" +#include "kern_util.h" +#include "user.h" +#include "process.h" +#include "signal_user.h" +#include "time_user.h" +#include "kern_constants.h" + +extern struct timeval xtime; + +struct timeval local_offset = { 0, 0 }; + +void timer(void) +{ + gettimeofday(&xtime, NULL); + timeradd(&xtime, &local_offset, &xtime); +} + +void set_interval(int timer_type) +{ + int usec = 1000000/hz(); + struct itimerval interval = ((struct itimerval) { { 0, usec }, + { 0, usec } }); + + if(setitimer(timer_type, &interval, NULL) == -1) + panic("setitimer failed - errno = %d\n", errno); +} + +void enable_timer(void) +{ + int usec = 1000000/hz(); + struct itimerval enable = ((struct itimerval) { { 0, usec }, + { 0, usec }}); + if(setitimer(ITIMER_VIRTUAL, &enable, NULL)) + printk("enable_timer - setitimer failed, errno = %d\n", + errno); +} + +void switch_timers(int to_real) +{ + struct itimerval disable = ((struct itimerval) { { 0, 0 }, { 0, 0 }}); + struct itimerval enable = ((struct itimerval) { { 0, 1000000/hz() }, + { 0, 1000000/hz() }}); + int old, new; + + if(to_real){ + old = ITIMER_VIRTUAL; + new = ITIMER_REAL; + } + else { + old = ITIMER_REAL; + new = ITIMER_VIRTUAL; + } + + if((setitimer(old, &disable, NULL) < 0) || + (setitimer(new, &enable, NULL))) + printk("switch_timers - setitimer failed, errno = %d\n", + errno); +} + +void uml_idle_timer(void) +{ + if(signal(SIGVTALRM, SIG_IGN) == 
SIG_ERR) + panic("Couldn't unset SIGVTALRM handler"); + + set_handler(SIGALRM, (__sighandler_t) alarm_handler, + SA_RESTART, SIGUSR1, SIGIO, SIGWINCH, SIGVTALRM, -1); + set_interval(ITIMER_REAL); +} + +void time_init(void) +{ + if(signal(SIGVTALRM, boot_timer_handler) == SIG_ERR) + panic("Couldn't set SIGVTALRM handler"); + set_interval(ITIMER_VIRTUAL); +} + +void do_gettimeofday(struct timeval *tv) +{ + unsigned long flags; + + flags = time_lock(); + gettimeofday(tv, NULL); + timeradd(tv, &local_offset, tv); + time_unlock(flags); +} + +EXPORT_SYMBOL(do_gettimeofday); + +int do_settimeofday(struct timespec *tv) +{ + struct timeval now; + unsigned long flags; + struct timeval tv_in; + + if ((unsigned long) tv->tv_nsec >= UM_NSEC_PER_SEC) + return -EINVAL; + + tv_in.tv_sec = tv->tv_sec; + tv_in.tv_usec = tv->tv_nsec / 1000; + + flags = time_lock(); + gettimeofday(&now, NULL); + timersub(&tv_in, &now, &local_offset); + time_unlock(flags); + + return(0); +} + +EXPORT_SYMBOL(do_settimeofday); + +void idle_sleep(int secs) +{ + struct timespec ts; + + ts.tv_sec = secs; + ts.tv_nsec = 0; + nanosleep(&ts, NULL); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/kernel/time.c~uml-summa.diff b/arch/um/kernel/time.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/kernel/time.c~uml-summa.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,138 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include <stdio.h> +#include <unistd.h> +#include <time.h> +#include <sys/time.h> +#include <signal.h> +#include <errno.h> +#include "linux/module.h" +#include "user_util.h" +#include "kern_util.h" +#include "user.h" +#include "process.h" +#include "signal_user.h" +#include "time_user.h" + +extern struct timeval xtime; + +void timer(void) +{ + gettimeofday(&xtime, NULL); +} + +void set_interval(int timer_type) +{ + int usec = 1000000/hz(); + struct itimerval interval = ((struct itimerval) { { 0, usec }, + { 0, usec } }); + + if(setitimer(timer_type, &interval, NULL) == -1) + panic("setitimer failed - errno = %d\n", errno); +} + +void enable_timer(void) +{ + int usec = 1000000/hz(); + struct itimerval enable = ((struct itimerval) { { 0, usec }, + { 0, usec }}); + if(setitimer(ITIMER_VIRTUAL, &enable, NULL)) + printk("enable_timer - setitimer failed, errno = %d\n", + errno); +} + +void switch_timers(int to_real) +{ + struct itimerval disable = ((struct itimerval) { { 0, 0 }, { 0, 0 }}); + struct itimerval enable = ((struct itimerval) { { 0, 1000000/hz() }, + { 0, 1000000/hz() }}); + int old, new; + + if(to_real){ + old = ITIMER_VIRTUAL; + new = ITIMER_REAL; + } + else { + old = ITIMER_REAL; + new = ITIMER_VIRTUAL; + } + + if((setitimer(old, &disable, NULL) < 0) || + (setitimer(new, &enable, NULL))) + printk("switch_timers - setitimer failed, errno = %d\n", + errno); +} + +void idle_timer(void) +{ + if(signal(SIGVTALRM, SIG_IGN) == SIG_ERR) + panic("Couldn't unset SIGVTALRM handler"); + + set_handler(SIGALRM, (__sighandler_t) alarm_handler, + SA_RESTART, SIGUSR1, SIGIO, SIGWINCH, SIGVTALRM, -1); + set_interval(ITIMER_REAL); +} + +void time_init(void) +{ + if(signal(SIGVTALRM, boot_timer_handler) == SIG_ERR) + 
diff -Nru a/arch/um/kernel/time_kern.c b/arch/um/kernel/time_kern.c
--- a/arch/um/kernel/time_kern.c Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/time_kern.c Fri Oct 31 14:10:54 2003
@@ -30,18 +30,26 @@ return(HZ); } +/* + * Scheduler clock - returns current time in nanosec units. + */ +unsigned long long sched_clock(void) +{ + return (unsigned long long)jiffies_64 * (1000000000 / HZ); +} + /* Changed at early boot */ int timer_irq_inited = 0; -/* missed_ticks will be modified after kernel memory has been - * write-protected, so this puts it in a section which will be left +/* missed_ticks will be modified after kernel memory has been + * write-protected, so this puts it in a section which will be left * write-enabled. */ int __attribute__ ((__section__ (".unprotected"))) missed_ticks[NR_CPUS]; void timer_irq(union uml_pt_regs *regs) { - int cpu = current->thread_info->cpu, ticks = missed_ticks[cpu]; + int cpu = current_thread->cpu, ticks = missed_ticks[cpu]; if(!timer_irq_inited) return; missed_ticks[cpu] = 0;
@@ -58,12 +66,13 @@ do_timer(&regs); } -void um_timer(int irq, void *dev, struct pt_regs *regs) +irqreturn_t um_timer(int irq, void *dev, struct pt_regs *regs) { do_timer(regs); - write_seqlock(&xtime_lock); + write_seqlock_irq(&xtime_lock); timer(); - write_sequnlock(&xtime_lock); + write_sequnlock_irq(&xtime_lock); + return(IRQ_HANDLED); } long um_time(int * tloc)
@@ -81,12 +90,12 @@ long um_stime(int * tptr) { int value; - struct timeval new; + struct timespec new; if (get_user(value, tptr)) return -EFAULT; new.tv_sec = value; - new.tv_usec = 0; + new.tv_nsec = 0; do_settimeofday(&new); return 0; }
@@ -125,9 +134,11 @@ void timer_handler(int sig, union uml_pt_regs *regs) { #ifdef CONFIG_SMP + local_irq_disable(); update_process_times(user_context(UPT_SP(regs))); + local_irq_enable(); #endif - if(current->thread_info->cpu == 0) + if(current_thread->cpu == 0) timer_irq(regs); }
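The new sched_clock() converts the 64-bit jiffies counter to nanoseconds. Because HZ is a compile-time constant in the hunk above, (1000000000 / HZ) folds to a constant multiplier (10000000 for HZ=100), so no 64-bit division happens at run time. The arithmetic in isolation, with hz passed as a parameter purely for illustration:

	unsigned long long jiffies_to_ns(unsigned long long j, unsigned long hz)
	{
		/* e.g. j=250, hz=100: 250 * 10000000 = 2500000000 ns = 2.5 s */
		return j * (1000000000ULL / hz);
	}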
diff -Nru a/arch/um/kernel/trap_kern.c b/arch/um/kernel/trap_kern.c
--- a/arch/um/kernel/trap_kern.c Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/trap_kern.c Fri Oct 31 14:10:53 2003
@@ -16,6 +16,7 @@ #include "asm/tlbflush.h" #include "asm/a.out.h" #include "asm/current.h" +#include "asm/irq.h" #include "user_util.h" #include "kern_util.h" #include "kern.h"
@@ -51,7 +52,7 @@ if(is_write && !(vma->vm_flags & VM_WRITE)) goto out; page = address & PAGE_MASK; - if(page == (unsigned long) current->thread_info + PAGE_SIZE) + if(page == (unsigned long) current_thread + PAGE_SIZE) panic("Kernel stack overflow"); pgd = pgd_offset(mm, page); pmd = pmd_offset(pgd, page);
@@ -178,6 +179,11 @@ if(current->thread.fault_catcher != NULL) do_longjmp(current->thread.fault_catcher, 1); else relay_signal(sig, regs); +} + +void winch(int sig, union uml_pt_regs *regs) +{ + do_IRQ(WINCH_IRQ, regs); } void trap_init(void)
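The handle_page_fault() hunk keeps the existing kernel-stack-overflow check while switching to the current_thread macro: the thread_info block sits at the base of the stack allocation and the stack grows down toward it, so a fault landing in the page directly above thread_info is taken as an overflow. A userspace rendering of that test; the names and the one-page red zone are assumptions for illustration, not this patch's code:

	#include <stdint.h>

	#define EX_PAGE_SIZE	4096UL
	#define EX_PAGE_MASK	(~(EX_PAGE_SIZE - 1))

	static int hit_stack_red_zone(uintptr_t fault_addr, uintptr_t stack_base)
	{
		uintptr_t page = fault_addr & EX_PAGE_MASK;

		/* stack grows down toward stack_base; the first page above
		 * the thread_info block acts as the red zone */
		return(page == stack_base + EX_PAGE_SIZE);
	}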
diff -Nru a/arch/um/kernel/trap_user.c b/arch/um/kernel/trap_user.c
--- a/arch/um/kernel/trap_user.c Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/trap_user.c Fri Oct 31 14:10:53 2003
@@ -82,6 +82,8 @@ .is_irq = 0 }, [ SIGILL ] { .handler = relay_signal, .is_irq = 0 }, + [ SIGWINCH ] { .handler = winch, + .is_irq = 1 }, [ SIGBUS ] { .handler = bus_handler, .is_irq = 0 }, [ SIGSEGV] { .handler = segv_handler,
@@ -121,9 +123,9 @@ void do_longjmp(void *b, int val) { - jmp_buf *buf = b; + sigjmp_buf *buf = b; - longjmp(*buf, val); + siglongjmp(*buf, val); } /*
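The do_longjmp() change matters because it is called from signal handlers: plain longjmp() does not restore the signal mask, so the signal being handled would stay blocked after the jump. sigsetjmp(buf, 1) saves the mask and siglongjmp() restores it. A standalone POSIX demonstration of the difference (not UML code):

	#include <setjmp.h>
	#include <signal.h>

	static sigjmp_buf buf;

	static void handler(int sig)
	{
		siglongjmp(buf, 1);	/* unwinds and restores the saved mask */
	}

	int example(void)
	{
		signal(SIGUSR1, handler);
		if(sigsetjmp(buf, 1) == 0)	/* second arg 1 => save the mask */
			raise(SIGUSR1);		/* returns here via the handler */
		return(0);	/* SIGUSR1 is unblocked again at this point */
	}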
diff -Nru a/arch/um/kernel/tt/exec_kern.c b/arch/um/kernel/tt/exec_kern.c
--- a/arch/um/kernel/tt/exec_kern.c Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/tt/exec_kern.c Fri Oct 31 14:10:53 2003
@@ -47,17 +47,17 @@ do_exit(SIGKILL); } - if(current->thread_info->cpu == 0) + if(current_thread->cpu == 0) forward_interrupts(new_pid); current->thread.request.op = OP_EXEC; current->thread.request.u.exec.pid = new_pid; - unprotect_stack((unsigned long) current->thread_info); + unprotect_stack((unsigned long) current_thread); os_usr1_process(os_getpid()); enable_timer(); free_page(stack); protect_memory(uml_reserved, high_physmem - uml_reserved, 1, 1, 0, 1); - task_protections((unsigned long) current->thread_info); + task_protections((unsigned long) current_thread); force_flush_all(); unblock_signals(); }
diff -Nru a/arch/um/kernel/tt/include/uaccess.h b/arch/um/kernel/tt/include/uaccess.h
--- a/arch/um/kernel/tt/include/uaccess.h Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/tt/include/uaccess.h Fri Oct 31 14:10:54 2003
@@ -46,18 +46,20 @@ static inline int copy_from_user_tt(void *to, const void *from, int n) { - return(access_ok_tt(VERIFY_READ, from, n) ? - __do_copy_from_user(to, from, n, - &current->thread.fault_addr, - &current->thread.fault_catcher) : n); + if(!access_ok_tt(VERIFY_READ, from, n)) + return(n); + + return(__do_copy_from_user(to, from, n, &current->thread.fault_addr, + &current->thread.fault_catcher)); } static inline int copy_to_user_tt(void *to, const void *from, int n) { - return(access_ok_tt(VERIFY_WRITE, to, n) ? - __do_copy_to_user(to, from, n, - &current->thread.fault_addr, - &current->thread.fault_catcher) : n); + if(!access_ok_tt(VERIFY_WRITE, to, n)) + return(n); + + return(__do_copy_to_user(to, from, n, &current->thread.fault_addr, + &current->thread.fault_catcher)); } extern int __do_strncpy_from_user(char *dst, const char *src, size_t n,
@@ -67,7 +69,9 @@ { int n; - if(!access_ok_tt(VERIFY_READ, src, 1)) return(-EFAULT); + if(!access_ok_tt(VERIFY_READ, src, 1)) + return(-EFAULT); + n = __do_strncpy_from_user(dst, src, count, &current->thread.fault_addr, &current->thread.fault_catcher);
@@ -87,10 +91,11 @@ static inline int clear_user_tt(void *mem, int len) { - return(access_ok_tt(VERIFY_WRITE, mem, len) ? - __do_clear_user(mem, len, - &current->thread.fault_addr, - &current->thread.fault_catcher) : len); + if(!access_ok_tt(VERIFY_WRITE, mem, len)) + return(len); + + return(__do_clear_user(mem, len, &current->thread.fault_addr, + &current->thread.fault_catcher)); } extern int __do_strnlen_user(const char *str, unsigned long n,
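The uaccess.h rewrite replaces the nested ternaries with guard clauses but keeps the kernel convention for these primitives: the return value is the number of bytes left uncopied, so a failed access_ok_tt() check returns the full length and 0 means complete success. A typical caller pattern; the wrapper function here is illustrative, not from this patch:

	#include "asm/errno.h"

	int example_fetch(void *kbuf, const void *ubuf, int len)
	{
		if(copy_from_user_tt(kbuf, ubuf, len) != 0)
			return(-EFAULT);	/* some bytes were not copied */
		return(0);
	}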
diff -Nru a/arch/um/kernel/tt/process_kern.c b/arch/um/kernel/tt/process_kern.c
--- a/arch/um/kernel/tt/process_kern.c Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/tt/process_kern.c Fri Oct 31 14:10:54 2003
@@ -104,7 +104,10 @@ void release_thread_tt(struct task_struct *task) { - os_kill_process(task->thread.mode.tt.extern_pid, 0); + int pid = task->thread.mode.tt.extern_pid; + + if(os_getpid() != pid) + os_kill_process(pid, 0); } void exit_thread_tt(void)
@@ -125,27 +128,27 @@ UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1); suspend_new_thread(current->thread.mode.tt.switch_pipe[0]); - block_signals(); + force_flush_all(); + if(current->thread.prev_sched != NULL) + schedule_tail(current->thread.prev_sched); + current->thread.prev_sched = NULL; + init_new_thread_signals(1); -#ifdef CONFIG_SMP - schedule_tail(current->thread.prev_sched); -#endif enable_timer(); free_page(current->thread.temp_stack); set_cmdline("(kernel thread)"); - force_flush_all(); - current->thread.prev_sched = NULL; change_sig(SIGUSR1, 1); change_sig(SIGVTALRM, 1); change_sig(SIGPROF, 1); - unblock_signals(); + local_irq_enable(); if(!run_kernel_thread(fn, arg, &current->thread.exec_buf)) do_exit(0); } static int new_thread_proc(void *stack) { + local_irq_disable(); init_new_thread_stack(stack, new_thread_handler); os_usr1_process(os_getpid()); return(0);
@@ -165,35 +168,32 @@ UPT_SC(&current->thread.regs.regs) = (void *) (&sig + 1); suspend_new_thread(current->thread.mode.tt.switch_pipe[0]); -#ifdef CONFIG_SMP - schedule_tail(NULL); -#endif + force_flush_all(); + if(current->thread.prev_sched != NULL) + schedule_tail(current->thread.prev_sched); + current->thread.prev_sched = NULL; + enable_timer(); change_sig(SIGVTALRM, 1); local_irq_enable(); - force_flush_all(); if(current->mm != current->parent->mm) protect_memory(uml_reserved, high_physmem - uml_reserved, 1, 1, 0, 1); - task_protections((unsigned long) current->thread_info); - - current->thread.prev_sched = NULL; + task_protections((unsigned long) current_thread); free_page(current->thread.temp_stack); + local_irq_disable(); change_sig(SIGUSR1, 0); set_user_mode(current); } -static int sigusr1 = SIGUSR1; - int fork_tramp(void *stack) { - int sig = sigusr1; - local_irq_disable(); + arch_init_thread(); init_new_thread_stack(stack, finish_fork_handler); - kill(os_getpid(), sig); + os_usr1_process(os_getpid()); return(0); }
@@ -377,8 +377,8 @@ pages = (1 << CONFIG_KERNEL_STACK_ORDER); - start = (unsigned long) current->thread_info + PAGE_SIZE; - end = (unsigned long) current + PAGE_SIZE * pages; + start = (unsigned long) current_thread + PAGE_SIZE; + end = (unsigned long) current_thread + PAGE_SIZE * pages; protect_memory(uml_reserved, start - uml_reserved, 1, w, 1, 1); protect_memory(end, high_physmem - end, 1, w, 1, 1);
diff -Nru a/arch/um/kernel/tt/ptproxy/proxy.c b/arch/um/kernel/tt/ptproxy/proxy.c
--- a/arch/um/kernel/tt/ptproxy/proxy.c Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/tt/ptproxy/proxy.c Fri Oct 31 14:10:54 2003
@@ -293,11 +293,10 @@ } char gdb_init_string[] = -"att 1 -b panic -b stop -handle SIGWINCH nostop noprint pass -"; +"att 1\n" +"b panic\n" +"b stop\n" +"handle SIGWINCH nostop noprint pass\n"; int start_debugger(char *prog, int startup, int stop, int *fd_out) {
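The gdb_init_string hunk replaces a string literal containing raw embedded newlines, an old gcc extension that newer compilers reject, with standard C adjacent-literal concatenation, which yields byte-for-byte identical contents:

	const char a[] = "att 1\n" "b panic\n";	/* concatenated at compile time */
	const char b[] = "att 1\nb panic\n";	/* same bytes as a */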
diff -Nru a/arch/um/kernel/tt/tlb.c b/arch/um/kernel/tt/tlb.c
--- a/arch/um/kernel/tt/tlb.c Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/tt/tlb.c Fri Oct 31 14:10:53 2003
@@ -10,6 +10,7 @@ #include "asm/page.h" #include "asm/pgtable.h" #include "asm/uaccess.h" +#include "asm/tlbflush.h" #include "user_util.h" #include "mem_user.h" #include "os.h"
diff -Nru a/arch/um/kernel/tt/tracer.c b/arch/um/kernel/tt/tracer.c
--- a/arch/um/kernel/tt/tracer.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/tt/tracer.c	Fri Oct 31 14:10:53 2003
@@ -39,7 +39,7 @@
 		return(0);
 
 	register_winch_irq(tracer_winch[0], fd, -1, data);
-	return(0);
+	return(1);
 }
 
 static void tracer_winch_handler(int sig)
@@ -401,7 +401,7 @@
 
 		if(!strcmp(line, "go")) debug_stop = 0;
 		else if(!strcmp(line, "parent")) debug_parent = 1;
-		else printk("Unknown debug option : '%s'\n", line);
+		else printf("Unknown debug option : '%s'\n", line);
 
 		line = next;
 	}
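Two things are going on in this tracer.c hunk: is_tracer_winch() now returns 1 on the match path, apparently so the caller can tell that the descriptor really was the tracer's winch source, and the boot-time option parser reports through printf() because it runs in the tracing thread, long before printk()'s console exists. The winch machinery itself is the classic self-pipe trick, the handler writes one byte and the read end becomes the IRQ source. A stand-alone sketch of that relay (POSIX only, names hypothetical):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static int winch_pipe[2];

static void winch_handler(int sig)
{
	char c = 1;

	/* write() is async-signal-safe; printk/printf are not */
	write(winch_pipe[1], &c, sizeof(c));
}

int main(void)
{
	char c;

	if(pipe(winch_pipe) < 0){
		perror("pipe");
		return 1;
	}
	signal(SIGWINCH, winch_handler);
	raise(SIGWINCH);
	if(read(winch_pipe[0], &c, sizeof(c)) == sizeof(c))
		printf("SIGWINCH relayed through the pipe\n");
	return 0;
}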
diff -Nru a/arch/um/kernel/tt/uaccess_user.c b/arch/um/kernel/tt/uaccess_user.c
--- a/arch/um/kernel/tt/uaccess_user.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/tt/uaccess_user.c	Fri Oct 31 14:10:53 2003
@@ -60,10 +60,10 @@
 {
 	int ret;
 	unsigned long *faddrp = (unsigned long *)fault_addr;
-	jmp_buf jbuf;
+	sigjmp_buf jbuf;
 
 	*fault_catcher = &jbuf;
-	if(setjmp(jbuf) == 0){
+	if(sigsetjmp(jbuf,1) == 0){
 		ret = strlen(str) + 1;
 	}
 	else {
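The jmp_buf to sigjmp_buf conversion here (and in the companion uaccess_user.c below) matters because the longjmp is taken from inside a SIGSEGV handler: plain setjmp() is not guaranteed to save the signal mask, so escaping the handler that way can leave SIGSEGV blocked afterwards. sigsetjmp(jbuf, 1) saves the mask and siglongjmp() restores it. A self-contained illustration of the pattern, deliberately faulting on a NULL pointer (demo only, not the kernel's fault catcher):

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf jbuf;

static void segv_handler(int sig)
{
	/* siglongjmp restores the mask saved by sigsetjmp, so SIGSEGV
	 * does not stay blocked once we escape the handler. */
	siglongjmp(jbuf, 1);
}

int main(void)
{
	volatile char *bad = NULL;

	signal(SIGSEGV, segv_handler);
	if(sigsetjmp(jbuf, 1) == 0)
		printf("%c\n", bad[0]);	/* faults */
	else printf("recovered, SIGSEGV no longer blocked\n");
	return 0;
}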
diff -Nru a/arch/um/kernel/tty_log.c b/arch/um/kernel/tty_log.c
--- a/arch/um/kernel/tty_log.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/tty_log.c	Fri Oct 31 14:10:54 2003
@@ -13,6 +13,7 @@
 #include <sys/time.h>
 #include "init.h"
 #include "user.h"
+#include "kern_util.h"
 #include "os.h"
 
 #define TTY_LOG_DIR "./"
@@ -24,29 +25,40 @@
 #define TTY_LOG_OPEN 1
 #define TTY_LOG_CLOSE 2
 #define TTY_LOG_WRITE 3
+#define TTY_LOG_EXEC 4
+
+#define TTY_READ 1
+#define TTY_WRITE 2
 
 struct tty_log_buf {
 	int what;
 	unsigned long tty;
 	int len;
+	int direction;
+	unsigned long sec;
+	unsigned long usec;
 };
 
-int open_tty_log(void *tty)
+int open_tty_log(void *tty, void *current_tty)
 {
 	struct timeval tv;
 	struct tty_log_buf data;
 	char buf[strlen(tty_log_dir) + sizeof("01234567890-01234567\0")];
 	int fd;
 
+	gettimeofday(&tv, NULL);
 	if(tty_log_fd != -1){
-		data = ((struct tty_log_buf) { what :	TTY_LOG_OPEN,
-					       tty : (unsigned long) tty,
-					       len : 0 });
+		data = ((struct tty_log_buf) { .what = TTY_LOG_OPEN,
+					       .tty = (unsigned long) tty,
+					       .len = sizeof(current_tty),
+					       .direction = 0,
+					       .sec = tv.tv_sec,
+					       .usec = tv.tv_usec } );
 		write(tty_log_fd, &data, sizeof(data));
+		write(tty_log_fd, &current_tty, data.len);
 		return(tty_log_fd);
 	}
 
-	gettimeofday(&tv, NULL);
 	sprintf(buf, "%s/%0u-%0u", tty_log_dir, (unsigned int) tv.tv_sec,
 		(unsigned int) tv.tv_usec);
 
@@ -62,30 +74,114 @@
 void close_tty_log(int fd, void *tty)
 {
 	struct tty_log_buf data;
+	struct timeval tv;
 
 	if(tty_log_fd != -1){
-		data = ((struct tty_log_buf) { what :	TTY_LOG_CLOSE,
-					       tty : (unsigned long) tty,
-					       len : 0 });
+		gettimeofday(&tv, NULL);
+		data = ((struct tty_log_buf) { .what = TTY_LOG_CLOSE,
+					       .tty = (unsigned long) tty,
+					       .len = 0,
+					       .direction = 0,
+					       .sec = tv.tv_sec,
+					       .usec = tv.tv_usec } );
 		write(tty_log_fd, &data, sizeof(data));
 		return;
 	}
 	close(fd);
 }
 
-int write_tty_log(int fd, char *buf, int len, void *tty)
+static int log_chunk(int fd, const char *buf, int len)
 {
+	int total = 0, try, missed, n;
+	char chunk[64];
+
+	while(len > 0){
+		try = (len > sizeof(chunk)) ? sizeof(chunk) : len;
+		missed = copy_from_user_proc(chunk, (char *) buf, try);
+		try -= missed;
+		n = write(fd, chunk, try);
+		if(n != try)
+			return(-errno);
+		if(missed != 0)
+			return(-EFAULT);
+
+		len -= try;
+		total += try;
+		buf += try;
+	}
+
+	return(total);
+}
+
+int write_tty_log(int fd, const char *buf, int len, void *tty, int is_read)
+{
+	struct timeval tv;
 	struct tty_log_buf data;
+	int direction;
 
 	if(fd == tty_log_fd){
-		data = ((struct tty_log_buf) { what :	TTY_LOG_WRITE,
-					       tty : (unsigned long) tty,
-					       len : len });
+		gettimeofday(&tv, NULL);
+		direction = is_read ? TTY_READ : TTY_WRITE;
+		data = ((struct tty_log_buf) { .what = TTY_LOG_WRITE,
+					       .tty = (unsigned long) tty,
+					       .len = len,
+					       .direction = direction,
+					       .sec = tv.tv_sec,
+					       .usec = tv.tv_usec } );
 		write(tty_log_fd, &data, sizeof(data));
 	}
-	return(write(fd, buf, len));
+
+	return(log_chunk(fd, buf, len));
 }
 
+void log_exec(char **argv, void *tty)
+{
+	struct timeval tv;
+	struct tty_log_buf data;
+	char **ptr,*arg;
+	int len;
+
+	if(tty_log_fd == -1) return;
+
+	gettimeofday(&tv, NULL);
+
+	len = 0;
+	for(ptr = argv; ; ptr++){
+		if(copy_from_user_proc(&arg, ptr, sizeof(arg)))
+			return;
+		if(arg == NULL) break;
+		len += strlen_user_proc(arg);
+	}
+
+	data = ((struct tty_log_buf) { .what = TTY_LOG_EXEC,
+				       .tty = (unsigned long) tty,
+				       .len = len,
+				       .direction = 0,
+				       .sec = tv.tv_sec,
+				       .usec = tv.tv_usec } );
+	write(tty_log_fd, &data, sizeof(data));
+
+	for(ptr = argv; ; ptr++){
+		if(copy_from_user_proc(&arg, ptr, sizeof(arg)))
+			return;
+		if(arg == NULL) break;
+		log_chunk(tty_log_fd, arg, strlen_user_proc(arg));
+	}
+}
+
+extern void register_tty_logger(int (*opener)(void *, void *),
+				int (*writer)(int, const char *, int,
					      void *, int),
+				void (*closer)(int, void *));
+
+static int register_logger(void)
+{
+	register_tty_logger(open_tty_log, write_tty_log, close_tty_log);
+	return(0);
+}
+
+__uml_initcall(register_logger);
+
 static int __init set_tty_log_dir(char *name, int *add)
 {
 	tty_log_dir = name;
@@ -104,7 +200,7 @@
 
 	tty_log_fd = strtoul(name, &end, 0);
 	if((*end != '\0') || (end == name)){
-		printk("set_tty_log_fd - strtoul failed on '%s'\n", name);
+		printf("set_tty_log_fd - strtoul failed on '%s'\n", name);
 		tty_log_fd = -1;
 	}
 	return 0;
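The log format change is easiest to see from the consumer side: every record is now a fixed-size header carrying a direction flag and a timestamp, followed by len bytes of payload, and payload writes are staged through log_chunk()'s 64-byte bounce buffer because the data lives in the virtual process's address space. A hypothetical reader for the new format (the struct mirrors tty_log_buf above; this tool is not part of the patch, and it assumes the log was written on a machine with the same word size and byte order):

#include <stdio.h>

struct tty_log_buf {
	int what;
	unsigned long tty;
	int len;
	int direction;
	unsigned long sec;
	unsigned long usec;
};

int main(void)
{
	struct tty_log_buf rec;
	int i;

	while(fread(&rec, sizeof(rec), 1, stdin) == 1){
		printf("what %d tty 0x%lx dir %d len %d at %lu.%06lu\n",
		       rec.what, rec.tty, rec.direction, rec.len,
		       rec.sec, rec.usec);
		for(i = 0; i < rec.len; i++)	/* skip the payload */
			if(getchar() == EOF)
				return 0;
	}
	return 0;
}

Usage would be something like "./ttylogdump < 1068000000-123456" against a log file produced by tty_log_dir=.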
diff -Nru a/arch/um/kernel/uaccess_user.c b/arch/um/kernel/uaccess_user.c
--- a/arch/um/kernel/uaccess_user.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/uaccess_user.c	Fri Oct 31 14:10:54 2003
@@ -18,9 +18,9 @@
 {
 	unsigned long *faddrp = (unsigned long *) fault_addr, ret;
 
-	jmp_buf jbuf;
+	sigjmp_buf jbuf;
 	*fault_catcher = &jbuf;
-	if(setjmp(jbuf) == 0){
+	if(sigsetjmp(jbuf,1) == 0){
 		(*op)(to, from, n);
 		ret = 0;
 		*faulted_out = 0;
diff -Nru a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
--- a/arch/um/kernel/um_arch.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/um_arch.c	Fri Oct 31 14:10:54 2003
@@ -38,13 +38,18 @@
 #include "mode_kern.h"
 #include "mode.h"
 
-#define DEFAULT_COMMAND_LINE "root=6200"
+#define DEFAULT_COMMAND_LINE "root=ubd0"
 
 struct cpuinfo_um boot_cpu_data = {
 	.loops_per_jiffy	= 0,
 	.ipi_pipe		= { -1, -1 }
 };
 
+/* Placeholder to make UML link until the vsyscall stuff is actually
+ * implemented
+ */
+void *__kernel_vsyscall;
+
 unsigned long thread_saved_pc(struct task_struct *task)
 {
 	return(os_process_pc(CHOOSE_MODE_PROC(thread_pid_tt, thread_pid_skas,
@@ -61,10 +66,14 @@
 		return 0;
 #endif
 
-	seq_printf(m, "bogomips\t: %lu.%02lu\n",
+	seq_printf(m, "processor\t: %d\n", index);
+	seq_printf(m, "vendor_id\t: User Mode Linux\n");
+	seq_printf(m, "model name\t: UML\n");
+	seq_printf(m, "mode\t\t: %s\n", CHOOSE_MODE("tt", "skas"));
+	seq_printf(m, "host\t\t: %s\n", host_info);
+	seq_printf(m, "bogomips\t: %lu.%02lu\n\n",
 		   loops_per_jiffy/(500000/HZ),
 		   (loops_per_jiffy/(5000/HZ)) % 100);
-	seq_printf(m, "host\t\t: %s\n", host_info);
 
 	return(0);
 }
@@ -134,12 +143,12 @@
 	if(umid != NULL){
 		snprintf(argv1_begin,
			 (argv1_end - argv1_begin) * sizeof(*ptr),
-			 "(%s)", umid);
+			 "(%s) ", umid);
 		ptr = &argv1_begin[strlen(argv1_begin)];
 	}
 	else ptr = argv1_begin;
 
-	snprintf(ptr, (argv1_end - ptr) * sizeof(*ptr), " [%s]", cmd);
+	snprintf(ptr, (argv1_end - ptr) * sizeof(*ptr), "[%s]", cmd);
 	memset(argv1_begin + strlen(argv1_begin), '\0',
	       argv1_end - argv1_begin - strlen(argv1_begin));
 #endif
@@ -179,7 +188,7 @@
 static int __init uml_ncpus_setup(char *line, int *add)
 {
 	if (!sscanf(line, "%d", &ncpus)) {
-		printk("Couldn't parse [%s]\n", line);
+		printf("Couldn't parse [%s]\n", line);
 		return -1;
 	}
 
@@ -210,7 +219,7 @@
 
 static int __init mode_tt_setup(char *line, int *add)
 {
-	printk("CONFIG_MODE_TT disabled - 'mode=tt' ignored\n");
+	printf("CONFIG_MODE_TT disabled - 'mode=tt' ignored\n");
 	return(0);
 }
 
@@ -221,7 +230,7 @@
 
 static int __init mode_tt_setup(char *line, int *add)
 {
-	printk("CONFIG_MODE_SKAS disabled - 'mode=tt' redundant\n");
+	printf("CONFIG_MODE_SKAS disabled - 'mode=tt' redundant\n");
 	return(0);
 }
 
@@ -369,6 +378,7 @@
 		2 * PAGE_SIZE;
 
 	task_protections((unsigned long) &init_thread_info);
+	os_flush_stdout();
 
 	return(CHOOSE_MODE(start_uml_tt(), start_uml_skas()));
 }
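With the new seq_printf() lines, /proc/cpuinfo on UML grows the identification fields that tools generally expect, plus the mode and host lines (host_info is the host's uname string). The output now has roughly this shape, values illustrative:

processor	: 0
vendor_id	: User Mode Linux
model name	: UML
mode		: skas
host		: Linux uml-host 2.4.22 #1 Thu Oct 2 12:00:00 EST 2003 i686
bogomips	: 715.22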
diff -Nru a/arch/um/kernel/umid.c b/arch/um/kernel/umid.c
--- a/arch/um/kernel/umid.c	Fri Oct 31 14:10:54 2003
+++ b/arch/um/kernel/umid.c	Fri Oct 31 14:10:54 2003
@@ -33,18 +33,19 @@
 static int umid_is_random = 1;
 static int umid_inited = 0;
 
-static int make_umid(void);
+static int make_umid(int (*printer)(const char *fmt, ...));
 
-static int __init set_umid(char *name, int is_random)
+static int __init set_umid(char *name, int is_random,
+			   int (*printer)(const char *fmt, ...))
 {
 	if(umid_inited){
-		printk("Unique machine name can't be set twice\n");
+		(*printer)("Unique machine name can't be set twice\n");
 		return(-1);
 	}
 
 	if(strlen(name) > UMID_LEN - 1)
-		printk("Unique machine name is being truncated to %s "
-		       "characters\n", UMID_LEN);
+		(*printer)("Unique machine name is being truncated to %s "
+			   "characters\n", UMID_LEN);
 	strlcpy(umid, name, sizeof(umid));
 
 	umid_is_random = is_random;
@@ -54,7 +55,7 @@
 
 static int __init set_umid_arg(char *name, int *add)
 {
-	return(set_umid(name, 0));
+	return(set_umid(name, 0, printf));
 }
 
 __uml_setup("umid=", set_umid_arg,
@@ -67,7 +68,7 @@
 {
 	int n;
 
-	if(!umid_inited && make_umid()) return(-1);
+	if(!umid_inited && make_umid(printk)) return(-1);
 
 	n = strlen(uml_dir) + strlen(umid) + strlen(name) + 1;
 	if(n > len){
@@ -92,14 +93,14 @@
 	fd = os_open_file(file, of_create(of_excl(of_rdwr(OPENFLAGS()))),
			  0644);
 	if(fd < 0){
-		printk("Open of machine pid file \"%s\" failed - "
+		printf("Open of machine pid file \"%s\" failed - "
 		       "errno = %d\n", file, -fd);
 		return 0;
 	}
 
 	sprintf(pid, "%d\n", os_getpid());
 	if(write(fd, pid, strlen(pid)) != strlen(pid))
-		printk("Write of pid file failed - errno = %d\n", errno);
+		printf("Write of pid file failed - errno = %d\n", errno);
 	close(fd);
 	return 0;
 }
@@ -197,7 +198,7 @@
 	if((strlen(name) > 0) && (name[strlen(name) - 1] != '/')){
 		uml_dir = malloc(strlen(name) + 1);
 		if(uml_dir == NULL){
-			printk("Failed to malloc uml_dir - error = %d\n",
+			printf("Failed to malloc uml_dir - error = %d\n",
 			       errno);
 			uml_dir = name;
 			return(0);
@@ -217,7 +218,7 @@
 		char *home = getenv("HOME");
 
 		if(home == NULL){
-			printk("make_uml_dir : no value in environment for "
+			printf("make_uml_dir : no value in environment for "
 			       "$HOME\n");
 			exit(1);
 		}
@@ -239,25 +240,25 @@
 	strcpy(uml_dir, dir);
 
 	if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){
-	        printk("Failed to mkdir %s - errno = %i\n", uml_dir, errno);
+	        printf("Failed to mkdir %s - errno = %i\n", uml_dir, errno);
 		return(-1);
 	}
 	return 0;
 }
 
-static int __init make_umid(void)
+static int __init make_umid(int (*printer)(const char *fmt, ...))
 {
 	int fd, err;
 	char tmp[strlen(uml_dir) + UMID_LEN + 1];
 
 	strlcpy(tmp, uml_dir, sizeof(tmp));
 
-	if(*umid == 0){
+	if(!umid_inited){
 		strcat(tmp, "XXXXXX");
 		fd = mkstemp(tmp);
 		if(fd < 0){
-			printk("make_umid - mkstemp failed, errno = %d\n",
-			       errno);
+			(*printer)("make_umid - mkstemp failed, errno = %d\n",
+				   errno);
 			return(1);
 		}
 
@@ -267,7 +268,7 @@
 		 * for directories.
 		 */
 		unlink(tmp);
-		set_umid(&tmp[strlen(uml_dir)], 1);
+		set_umid(&tmp[strlen(uml_dir)], 1, printer);
 	}
 
 	sprintf(tmp, "%s%s", uml_dir, umid);
@@ -275,14 +276,14 @@
 	if((err = mkdir(tmp, 0777)) < 0){
 		if(errno == EEXIST){
 			if(not_dead_yet(tmp)){
-				printk("umid '%s' is in use\n", umid);
+				(*printer)("umid '%s' is in use\n", umid);
 				return(-1);
 			}
 			err = mkdir(tmp, 0777);
 		}
 	}
 	if(err < 0){
-		printk("Failed to create %s - errno = %d\n", umid, errno);
+		(*printer)("Failed to create %s - errno = %d\n", umid, errno);
 		return(-1);
 	}
 
@@ -295,7 +296,13 @@
 );
 
 __uml_postsetup(make_uml_dir);
-__uml_postsetup(make_umid);
+
+static int __init make_umid_setup(void)
+{
+	return(make_umid(printf));
+}
+
+__uml_postsetup(make_umid_setup);
 __uml_postsetup(create_pid_file);
 
 /*
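The point of threading a printer argument through make_umid() and set_umid() is that the same code now runs in two environments: from command-line and postsetup processing before the console is up, where printf is the only thing that works, and from umid_file_name() inside the running kernel, where printk is correct. The shape of the pattern, reduced to a stand-alone demo (report() and its caller are hypothetical; printf happens to match the int (*)(const char *, ...) signature exactly):

#include <stdio.h>

static int report(int (*printer)(const char *fmt, ...), int err)
{
	if(err)
		(*printer)("operation failed, errno = %d\n", err);
	return err;
}

int main(void)
{
	/* The early-boot path passes printf; the kernel path would
	 * pass printk instead. */
	return report(printf, 0);
}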
printk("actually_do_remove : couldn't open directory '%s', " + "errno = %d\n", dir, errno); + return(1); + } + while((ent = readdir(directory)) != NULL){ + if(!strcmp(ent->d_name, ".") || !strcmp(ent->d_name, "..")) + continue; + len = strlen(dir) + sizeof("/") + strlen(ent->d_name) + 1; + if(len > sizeof(file)){ + printk("Not deleting '%s' from '%s' - name too long\n", + ent->d_name, dir); + continue; + } + sprintf(file, "%s/%s", dir, ent->d_name); + if(unlink(file) < 0){ + printk("actually_do_remove : couldn't remove '%s' " + "from '%s', errno = %d\n", ent->d_name, dir, + errno); + return(1); + } + } + if(rmdir(dir) < 0){ + printk("actually_do_remove : couldn't rmdir '%s', " + "errno = %d\n", dir, errno); + return(1); + } + return(0); +} + +void remove_umid_dir(void) +{ + char dir[strlen(uml_dir) + UMID_LEN + 1]; + if(!umid_inited) return; + + sprintf(dir, "%s%s", uml_dir, umid); + actually_do_remove(dir); +} + +char *get_umid(int only_if_set) +{ + if(only_if_set && umid_is_random) return(NULL); + return(umid); +} + +int not_dead_yet(char *dir) +{ + char file[strlen(uml_dir) + UMID_LEN + sizeof("/pid\0")]; + char pid[sizeof("nnnnn\0")], *end; + int dead, fd, p; + + sprintf(file, "%s/pid", dir); + dead = 0; + if((fd = os_open_file(file, of_read(OPENFLAGS()), 0)) < 0){ + if(fd != -ENOENT){ + printk("not_dead_yet : couldn't open pid file '%s', " + "errno = %d\n", file, -fd); + return(1); + } + dead = 1; + } + if(fd > 0){ + if(read(fd, pid, sizeof(pid)) < 0){ + printk("not_dead_yet : couldn't read pid file '%s', " + "errno = %d\n", file, errno); + return(1); + } + p = strtoul(pid, &end, 0); + if(end == pid){ + printk("not_dead_yet : couldn't parse pid file '%s', " + "errno = %d\n", file, errno); + dead = 1; + } + if(((kill(p, 0) < 0) && (errno == ESRCH)) || + (p == CHOOSE_MODE(tracing_pid, os_getpid()))) + dead = 1; + } + if(!dead) return(1); + return(actually_do_remove(dir)); +} + +static int __init set_uml_dir(char *name, int *add) +{ + if((strlen(name) > 0) && (name[strlen(name) - 1] != '/')){ + uml_dir = malloc(strlen(name) + 1); + if(uml_dir == NULL){ + printk("Failed to malloc uml_dir - error = %d\n", + errno); + uml_dir = name; + return(0); + } + sprintf(uml_dir, "%s/", name); + } + else uml_dir = name; + return 0; +} + +static int __init make_uml_dir(void) +{ + char dir[MAXPATHLEN + 1] = { '\0' }; + int len; + + if(*uml_dir == '~'){ + char *home = getenv("HOME"); + + if(home == NULL){ + printk("make_uml_dir : no value in environment for " + "$HOME\n"); + exit(1); + } + strlcpy(dir, home, sizeof(dir)); + uml_dir++; + } + len = strlen(dir); + strncat(dir, uml_dir, sizeof(dir) - len); + len = strlen(dir); + if((len > 0) && (len < sizeof(dir) - 1) && (dir[len - 1] != '/')){ + dir[len] = '/'; + dir[len + 1] = '\0'; + } + + if((uml_dir = malloc(strlen(dir) + 1)) == NULL){ + printf("make_uml_dir : malloc failed, errno = %d\n", errno); + exit(1); + } + strcpy(uml_dir, dir); + + if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){ + printk("Failed to mkdir %s - errno = %i\n", uml_dir, errno); + return(-1); + } + return 0; +} + +static int __init make_umid(void) +{ + int fd, err; + char tmp[strlen(uml_dir) + UMID_LEN + 1]; + + strlcpy(tmp, uml_dir, sizeof(tmp)); + + if(*umid == 0){ + strcat(tmp, "XXXXXX"); + fd = mkstemp(tmp); + if(fd < 0){ + printk("make_umid - mkstemp failed, errno = %d\n", + errno); + return(1); + } + + close(fd); + /* There's a nice tiny little race between this unlink and + * the mkdir below. It'd be nice if there were a mkstemp + * for directories. 
+ */ + unlink(tmp); + set_umid(&tmp[strlen(uml_dir)], 1); + } + + sprintf(tmp, "%s%s", uml_dir, umid); + + if((err = mkdir(tmp, 0777)) < 0){ + if(errno == EEXIST){ + if(not_dead_yet(tmp)){ + printk("umid '%s' is in use\n", umid); + return(-1); + } + err = mkdir(tmp, 0777); + } + } + if(err < 0){ + printk("Failed to create %s - errno = %d\n", umid, errno); + return(-1); + } + + return(0); +} + +__uml_setup("uml_dir=", set_uml_dir, +"uml_dir=<directory>\n" +" The location to place the pid and umid files.\n\n" +); + +__uml_postsetup(make_uml_dir); +__uml_postsetup(make_umid); +__uml_postsetup(create_pid_file); + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/kernel/user_syms.c b/arch/um/kernel/user_syms.c --- a/arch/um/kernel/user_syms.c Fri Oct 31 14:10:54 2003 +++ b/arch/um/kernel/user_syms.c Fri Oct 31 14:10:54 2003 @@ -27,7 +27,7 @@ #define __MODULE_STRING_1(x) #x #define __MODULE_STRING(x) __MODULE_STRING_1(x) -#if !defined(__AUTOCONF_INCLUDED__) +#if !defined(AUTOCONF_INCLUDED) #define __EXPORT_SYMBOL(sym,str) error config_must_be_included_before_module #define EXPORT_SYMBOL(var) error config_must_be_included_before_module diff -Nru a/arch/um/kernel/user_syms.c~uml-AUTOCONF_INCLUDED.diff b/arch/um/kernel/user_syms.c~uml-AUTOCONF_INCLUDED.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/kernel/user_syms.c~uml-AUTOCONF_INCLUDED.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,113 @@ +#include <stdio.h> +#include <unistd.h> +#include <fcntl.h> +#include <dirent.h> +#include <errno.h> +#include <utime.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/vfs.h> +#include <sys/ioctl.h> +#include "user_util.h" +#include "mem_user.h" +#include "uml-config.h" + +/* Had to steal this from linux/module.h because that file can't be included + * since this includes various user-level headers. + */ + +struct module_symbol +{ + unsigned long value; + const char *name; +}; + +/* Indirect stringification. 
diff -Nru a/arch/um/kernel/user_util.c b/arch/um/kernel/user_util.c
--- a/arch/um/kernel/user_util.c	Fri Oct 31 14:10:53 2003
+++ b/arch/um/kernel/user_util.c	Fri Oct 31 14:10:53 2003
@@ -119,17 +119,6 @@
 	}
 }
 
-int clone_and_wait(int (*fn)(void *), void *arg, void *sp, int flags)
-{
-	int pid;
-
-	pid = clone(fn, sp, flags, arg);
-	if(pid < 0) return(-1);
-	wait_for_stop(pid, SIGSTOP, PTRACE_CONT, NULL);
-	ptrace(PTRACE_CONT, pid, 0, 0);
-	return(pid);
-}
-
 int raw(int fd, int complain)
 {
 	struct termios tt;
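clone_and_wait() is dropped from user_util.c here. For reference, the stop-and-continue handshake it wrapped, wait for the child to stop itself and then PTRACE_CONT it, looks like this as a stand-alone program (fork() stands in for clone(); Linux-specific, demo only):

#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>

int main(void)
{
	int status;
	pid_t pid = fork();

	if(pid == 0){
		ptrace(PTRACE_TRACEME, 0, 0, 0);
		raise(SIGSTOP);		/* parent sees this stop */
		_exit(0);
	}
	if(waitpid(pid, &status, WUNTRACED) < 0 || !WIFSTOPPED(status)){
		fprintf(stderr, "child did not stop as expected\n");
		return 1;
	}
	ptrace(PTRACE_CONT, pid, 0, 0);
	waitpid(pid, &status, 0);
	printf("child continued and exited\n");
	return 0;
}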
+#include "user_util.h" +#include "kern_util.h" +#include "user.h" +#include "mem_user.h" +#include "init.h" +#include "helper.h" +#include "uml-config.h" + +#define COMMAND_LINE_SIZE _POSIX_ARG_MAX + +/* Changed in linux_main and setup_arch, which run before SMP is started */ +char saved_command_line[COMMAND_LINE_SIZE] = { 0 }; +char command_line[COMMAND_LINE_SIZE] = { 0 }; + +void add_arg(char *cmd_line, char *arg) +{ + if (strlen(cmd_line) + strlen(arg) + 1 > COMMAND_LINE_SIZE) { + printf("add_arg: Too much command line!\n"); + exit(1); + } + if(strlen(cmd_line) > 0) strcat(cmd_line, " "); + strcat(cmd_line, arg); +} + +void stop(void) +{ + while(1) sleep(1000000); +} + +void stack_protections(unsigned long address) +{ + int prot = PROT_READ | PROT_WRITE | PROT_EXEC; + + if(mprotect((void *) address, page_size(), prot) < 0) + panic("protecting stack failed, errno = %d", errno); +} + +void task_protections(unsigned long address) +{ + unsigned long guard = address + page_size(); + unsigned long stack = guard + page_size(); + int prot = 0, pages; + +#ifdef notdef + if(mprotect((void *) stack, page_size(), prot) < 0) + panic("protecting guard page failed, errno = %d", errno); +#endif + pages = (1 << UML_CONFIG_KERNEL_STACK_ORDER) - 2; + prot = PROT_READ | PROT_WRITE | PROT_EXEC; + if(mprotect((void *) stack, pages * page_size(), prot) < 0) + panic("protecting stack failed, errno = %d", errno); +} + +int wait_for_stop(int pid, int sig, int cont_type, void *relay) +{ + sigset_t *relay_signals = relay; + int status, ret; + + while(1){ + if(((ret = waitpid(pid, &status, WUNTRACED)) < 0) || + !WIFSTOPPED(status) || (WSTOPSIG(status) != sig)){ + if(ret < 0){ + if(errno == EINTR) continue; + printk("wait failed, errno = %d\n", + errno); + } + else if(WIFEXITED(status)) + printk("process exited with status %d\n", + WEXITSTATUS(status)); + else if(WIFSIGNALED(status)) + printk("process exited with signal %d\n", + WTERMSIG(status)); + else if((WSTOPSIG(status) == SIGVTALRM) || + (WSTOPSIG(status) == SIGALRM) || + (WSTOPSIG(status) == SIGIO) || + (WSTOPSIG(status) == SIGPROF) || + (WSTOPSIG(status) == SIGCHLD) || + (WSTOPSIG(status) == SIGWINCH) || + (WSTOPSIG(status) == SIGINT)){ + ptrace(cont_type, pid, 0, WSTOPSIG(status)); + continue; + } + else if((relay_signals != NULL) && + sigismember(relay_signals, WSTOPSIG(status))){ + ptrace(cont_type, pid, 0, WSTOPSIG(status)); + continue; + } + else printk("process stopped with signal %d\n", + WSTOPSIG(status)); + panic("wait_for_stop failed to wait for %d to stop " + "with %d\n", pid, sig); + } + return(status); + } +} + +int clone_and_wait(int (*fn)(void *), void *arg, void *sp, int flags) +{ + int pid; + + pid = clone(fn, sp, flags, arg); + if(pid < 0) return(-1); + wait_for_stop(pid, SIGSTOP, PTRACE_CONT, NULL); + ptrace(PTRACE_CONT, pid, 0, 0); + return(pid); +} + +int raw(int fd, int complain) +{ + struct termios tt; + int err; + + tcgetattr(fd, &tt); + cfmakeraw(&tt); + err = tcsetattr(fd, TCSANOW, &tt); + if((err < 0) && complain){ + printk("tcsetattr failed, errno = %d\n", errno); + return(-errno); + } + return(0); +} + +void setup_machinename(char *machine_out) +{ + struct utsname host; + + uname(&host); + strcpy(machine_out, host.machine); +} + +char host_info[(_UTSNAME_LENGTH + 1) * 4 + _UTSNAME_NODENAME_LENGTH + 1]; + +void setup_hostinfo(void) +{ + struct utsname host; + + uname(&host); + sprintf(host_info, "%s %s %s %s %s", host.sysname, host.nodename, + host.release, host.version, host.machine); +} + +/* + * Overrides for Emacs so that we 
follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c --- a/arch/um/os-Linux/drivers/tuntap_user.c Fri Oct 31 14:10:54 2003 +++ b/arch/um/os-Linux/drivers/tuntap_user.c Fri Oct 31 14:10:54 2003 @@ -142,7 +142,7 @@ return(-errno); } memset(&ifr, 0, sizeof(ifr)); - ifr.ifr_flags = IFF_TAP; + ifr.ifr_flags = IFF_TAP | IFF_NO_PI; strlcpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name)); if(ioctl(pri->fd, TUNSETIFF, (void *) &ifr) < 0){ printk("TUNSETIFF failed, errno = %d", errno); diff -Nru a/arch/um/os-Linux/drivers/tuntap_user.c~uml-summa.diff b/arch/um/os-Linux/drivers/tuntap_user.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/os-Linux/drivers/tuntap_user.c~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,223 @@ +/* + * Copyright (C) 2001, 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include <stdio.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/wait.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/uio.h> +#include <sys/ioctl.h> +#include <net/if.h> +#include <linux/if_tun.h> +#include "net_user.h" +#include "tuntap.h" +#include "kern_util.h" +#include "user.h" +#include "helper.h" +#include "os.h" + +#define MAX_PACKET ETH_MAX_PACKET + +void tuntap_user_init(void *data, void *dev) +{ + struct tuntap_data *pri = data; + + pri->dev = dev; +} + +static void tuntap_add_addr(unsigned char *addr, unsigned char *netmask, + void *data) +{ + struct tuntap_data *pri = data; + + tap_check_ips(pri->gate_addr, addr); + if((pri->fd == -1) || pri->fixed_config) return; + open_addr(addr, netmask, pri->dev_name); +} + +static void tuntap_del_addr(unsigned char *addr, unsigned char *netmask, + void *data) +{ + struct tuntap_data *pri = data; + + if((pri->fd == -1) || pri->fixed_config) return; + close_addr(addr, netmask, pri->dev_name); +} + +struct tuntap_pre_exec_data { + int stdout; + int close_me; +}; + +static void tuntap_pre_exec(void *arg) +{ + struct tuntap_pre_exec_data *data = arg; + + dup2(data->stdout, 1); + close(data->close_me); +} + +static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote, + char *buffer, int buffer_len, int *used_out) +{ + struct tuntap_pre_exec_data data; + char version_buf[sizeof("nnnnn\0")]; + char *argv[] = { "uml_net", version_buf, "tuntap", "up", gate, + NULL }; + char buf[CMSG_SPACE(sizeof(*fd_out))]; + struct msghdr msg; + struct cmsghdr *cmsg; + struct iovec iov; + int pid, n; + + sprintf(version_buf, "%d", UML_NET_VERSION); + + data.stdout = remote; + data.close_me = me; + + pid = run_helper(tuntap_pre_exec, &data, argv, NULL); + + if(pid < 0) return(-pid); + + close(remote); + + msg.msg_name = NULL; + msg.msg_namelen = 0; + if(buffer != NULL){ + iov = ((struct iovec) { buffer, buffer_len }); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + } + else { + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + } + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + msg.msg_flags = 0; + n = recvmsg(me, &msg, 0); + *used_out = n; + if(n < 0){ + printk("tuntap_open_tramp : recvmsg failed - errno = %d\n", + errno); + return(errno); + } + waitpid(pid, 
NULL, 0); + + cmsg = CMSG_FIRSTHDR(&msg); + if(cmsg == NULL){ + printk("tuntap_open_tramp : didn't receive a message\n"); + return(EINVAL); + } + if((cmsg->cmsg_level != SOL_SOCKET) || + (cmsg->cmsg_type != SCM_RIGHTS)){ + printk("tuntap_open_tramp : didn't receive a descriptor\n"); + return(EINVAL); + } + *fd_out = ((int *) CMSG_DATA(cmsg))[0]; + return(0); +} + +static int tuntap_open(void *data) +{ + struct ifreq ifr; + struct tuntap_data *pri = data; + char *output, *buffer; + int err, fds[2], len, used; + + err = tap_open_common(pri->dev, pri->gate_addr); + if(err) return(err); + + if(pri->fixed_config){ + if((pri->fd = open("/dev/net/tun", O_RDWR)) < 0){ + printk("Failed to open /dev/net/tun, errno = %d\n", + errno); + return(-errno); + } + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_flags = IFF_TAP; + strlcpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name)); + if(ioctl(pri->fd, TUNSETIFF, (void *) &ifr) < 0){ + printk("TUNSETIFF failed, errno = %d", errno); + close(pri->fd); + return(-errno); + } + } + else { + err = os_pipe(fds, 0, 0); + if(err){ + printk("tuntap_open : os_pipe failed - errno = %d\n", + -err); + return(err); + } + + buffer = get_output_buffer(&len); + if(buffer != NULL) len--; + used = 0; + + err = tuntap_open_tramp(pri->gate_addr, &pri->fd, fds[0], + fds[1], buffer, len, &used); + + output = buffer; + if(err == 0){ + pri->dev_name = uml_strdup(buffer); + output += IFNAMSIZ; + printk(output); + free_output_buffer(buffer); + } + else { + printk(output); + free_output_buffer(buffer); + printk("tuntap_open_tramp failed - errno = %d\n", err); + return(-err); + } + close(fds[0]); + iter_addresses(pri->dev, open_addr, pri->dev_name); + } + + return(pri->fd); +} + +static void tuntap_close(int fd, void *data) +{ + struct tuntap_data *pri = data; + + if(!pri->fixed_config) + iter_addresses(pri->dev, close_addr, pri->dev_name); + close(fd); + pri->fd = -1; +} + +static int tuntap_set_mtu(int mtu, void *data) +{ + return(mtu); +} + +struct net_user_info tuntap_user_info = { + .init = tuntap_user_init, + .open = tuntap_open, + .close = tuntap_close, + .remove = NULL, + .set_mtu = tuntap_set_mtu, + .add_address = tuntap_add_addr, + .delete_address = tuntap_del_addr, + .max_packet = MAX_PACKET +}; + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c --- a/arch/um/os-Linux/file.c Fri Oct 31 14:10:54 2003 +++ b/arch/um/os-Linux/file.c Fri Oct 31 14:10:54 2003 @@ -315,7 +315,7 @@ return(new); } -int create_unix_socket(char *file, int len) +int create_unix_socket(char *file, int len, int close_on_exec) { struct sockaddr_un addr; int sock, err; @@ -327,6 +327,10 @@ return(-errno); } + if(close_on_exec && fcntl(sock, F_SETFD, 1) < 0) + printk("create_unix_socket : Setting FD_CLOEXEC failed, " + "errno = %d", errno); + addr.sun_family = AF_UNIX; /* XXX Be more careful about overflow */ @@ -340,6 +344,37 @@ } return(sock); +} + +void os_flush_stdout(void) +{ + fflush(stdout); +} + +int os_lock_file(int fd, int excl) +{ + int type = excl ? 
F_WRLCK : F_RDLCK; + struct flock lock = ((struct flock) { .l_type = type, + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0 } ); + int err, save; + + err = fcntl(fd, F_SETLK, &lock); + if(!err) + goto out; + + save = -errno; + err = fcntl(fd, F_GETLK, &lock); + if(err){ + err = -errno; + goto out; + } + + printk("F_SETLK failed, file already locked by pid %d\n", lock.l_pid); + err = save; + out: + return(err); } /* diff -Nru a/arch/um/os-Linux/file.c~uml-summa.diff b/arch/um/os-Linux/file.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/os-Linux/file.c~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,354 @@ +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include <stdio.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <signal.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/ioctl.h> +#include <sys/mount.h> +#include <sys/uio.h> +#include "os.h" +#include "user.h" +#include "kern_util.h" + +int os_file_type(char *file) +{ + struct stat64 buf; + + if(stat64(file, &buf) == -1) + return(-errno); + + if(S_ISDIR(buf.st_mode)) return(OS_TYPE_DIR); + else if(S_ISLNK(buf.st_mode)) return(OS_TYPE_SYMLINK); + else if(S_ISCHR(buf.st_mode)) return(OS_TYPE_CHARDEV); + else if(S_ISBLK(buf.st_mode)) return(OS_TYPE_BLOCKDEV); + else if(S_ISFIFO(buf.st_mode)) return(OS_TYPE_FIFO); + else if(S_ISSOCK(buf.st_mode)) return(OS_TYPE_SOCK); + else return(OS_TYPE_FILE); +} + +int os_file_mode(char *file, struct openflags *mode_out) +{ + *mode_out = OPENFLAGS(); + + if(!access(file, W_OK)) *mode_out = of_write(*mode_out); + else if(errno != EACCES) + return(-errno); + + if(!access(file, R_OK)) *mode_out = of_read(*mode_out); + else if(errno != EACCES) + return(-errno); + + return(0); +} + +int os_open_file(char *file, struct openflags flags, int mode) +{ + int fd, f = 0; + + if(flags.r && flags.w) f = O_RDWR; + else if(flags.r) f = O_RDONLY; + else if(flags.w) f = O_WRONLY; + else f = 0; + + if(flags.s) f |= O_SYNC; + if(flags.c) f |= O_CREAT; + if(flags.t) f |= O_TRUNC; + if(flags.e) f |= O_EXCL; + + fd = open64(file, f, mode); + if(fd < 0) return(-errno); + + if(flags.cl){ + if(fcntl(fd, F_SETFD, 1)){ + close(fd); + return(-errno); + } + } + + return(fd); + return(fd); +} + +int os_connect_socket(char *name) +{ + struct sockaddr_un sock; + int fd, err; + + sock.sun_family = AF_UNIX; + snprintf(sock.sun_path, sizeof(sock.sun_path), "%s", name); + + fd = socket(AF_UNIX, SOCK_STREAM, 0); + if(fd < 0) + return(fd); + + err = connect(fd, (struct sockaddr *) &sock, sizeof(sock)); + if(err) + return(err); + + return(fd); +} + +void os_close_file(int fd) +{ + close(fd); +} + +int os_seek_file(int fd, __u64 offset) +{ + __u64 actual; + + actual = lseek64(fd, offset, SEEK_SET); + if(actual != offset) return(-errno); + return(0); +} + +int os_read_file(int fd, void *buf, int len) +{ + int n; + + /* Force buf into memory if it's not already. */ + + /* XXX This fails if buf is kernel memory */ +#ifdef notdef + if(copy_to_user_proc(buf, &c, sizeof(c))) + return(-EFAULT); +#endif + + n = read(fd, buf, len); + if(n < 0) + return(-errno); + return(n); +} + +int os_write_file(int fd, void *buf, int count) +{ + int n; + + /* Force buf into memory if it's not already. 
*/ + + /* XXX This fails if buf is kernel memory */ +#ifdef notdef + if(copy_to_user_proc(buf, buf, buf[0])) + return(-EFAULT); +#endif + + n = write(fd, buf, count); + if(n < 0) + return(-errno); + return(n); +} + +int os_file_size(char *file, long long *size_out) +{ + struct stat64 buf; + + if(stat64(file, &buf) == -1){ + printk("Couldn't stat \"%s\" : errno = %d\n", file, errno); + return(-errno); + } + if(S_ISBLK(buf.st_mode)){ + int fd, blocks; + + if((fd = open64(file, O_RDONLY)) < 0){ + printk("Couldn't open \"%s\", errno = %d\n", file, + errno); + return(-errno); + } + if(ioctl(fd, BLKGETSIZE, &blocks) < 0){ + printk("Couldn't get the block size of \"%s\", " + "errno = %d\n", file, errno); + close(fd); + return(-errno); + } + *size_out = ((long long) blocks) * 512; + close(fd); + return(0); + } + *size_out = buf.st_size; + return(0); +} + +int os_pipe(int *fds, int stream, int close_on_exec) +{ + int err, type = stream ? SOCK_STREAM : SOCK_DGRAM; + + err = socketpair(AF_UNIX, type, 0, fds); + if(err) + return(-errno); + + if(!close_on_exec) + return(0); + + if((fcntl(fds[0], F_SETFD, 1) < 0) || (fcntl(fds[1], F_SETFD, 1) < 0)) + printk("os_pipe : Setting FD_CLOEXEC failed, errno = %d", + errno); + + return(0); +} + +int os_set_fd_async(int fd, int owner) +{ + /* XXX This should do F_GETFL first */ + if(fcntl(fd, F_SETFL, O_ASYNC | O_NONBLOCK) < 0){ + printk("os_set_fd_async : failed to set O_ASYNC and " + "O_NONBLOCK on fd # %d, errno = %d\n", fd, errno); + return(-errno); + } +#ifdef notdef + if(fcntl(fd, F_SETFD, 1) < 0){ + printk("os_set_fd_async : Setting FD_CLOEXEC failed, " + "errno = %d\n", errno); + } +#endif + + if((fcntl(fd, F_SETSIG, SIGIO) < 0) || + (fcntl(fd, F_SETOWN, owner) < 0)){ + printk("os_set_fd_async : Failed to fcntl F_SETOWN " + "(or F_SETSIG) fd %d to pid %d, errno = %d\n", fd, + owner, errno); + return(-errno); + } + + return(0); +} + +int os_set_fd_block(int fd, int blocking) +{ + int flags; + + flags = fcntl(fd, F_GETFL); + + if(blocking) flags &= ~O_NONBLOCK; + else flags |= O_NONBLOCK; + + if(fcntl(fd, F_SETFL, flags) < 0){ + printk("Failed to change blocking on fd # %d, errno = %d\n", + fd, errno); + return(-errno); + } + return(0); +} + +int os_accept_connection(int fd) +{ + int new; + + new = accept(fd, NULL, 0); + if(new < 0) + return(-errno); + return(new); +} + +#ifndef SHUT_RD +#define SHUT_RD 0 +#endif + +#ifndef SHUT_WR +#define SHUT_WR 1 +#endif + +#ifndef SHUT_RDWR +#define SHUT_RDWR 2 +#endif + +int os_shutdown_socket(int fd, int r, int w) +{ + int what, err; + + if(r && w) what = SHUT_RDWR; + else if(r) what = SHUT_RD; + else if(w) what = SHUT_WR; + else { + printk("os_shutdown_socket : neither r or w was set\n"); + return(-EINVAL); + } + err = shutdown(fd, what); + if(err) + return(-errno); + return(0); +} + +int os_rcv_fd(int fd, int *helper_pid_out) +{ + int new, n; + char buf[CMSG_SPACE(sizeof(new))]; + struct msghdr msg; + struct cmsghdr *cmsg; + struct iovec iov; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + iov = ((struct iovec) { .iov_base = helper_pid_out, + .iov_len = sizeof(*helper_pid_out) }); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + msg.msg_flags = 0; + + n = recvmsg(fd, &msg, 0); + if(n < 0) + return(-errno); + + else if(n != sizeof(iov.iov_len)) + *helper_pid_out = -1; + + cmsg = CMSG_FIRSTHDR(&msg); + if(cmsg == NULL){ + printk("rcv_fd didn't receive anything, error = %d\n", errno); + return(-1); + } + if((cmsg->cmsg_level != SOL_SOCKET) || + 
(cmsg->cmsg_type != SCM_RIGHTS)){ + printk("rcv_fd didn't receive a descriptor\n"); + return(-1); + } + + new = ((int *) CMSG_DATA(cmsg))[0]; + return(new); +} + +int create_unix_socket(char *file, int len) +{ + struct sockaddr_un addr; + int sock, err; + + sock = socket(PF_UNIX, SOCK_DGRAM, 0); + if (sock < 0){ + printk("create_unix_socket - socket failed, errno = %d\n", + errno); + return(-errno); + } + + addr.sun_family = AF_UNIX; + + /* XXX Be more careful about overflow */ + snprintf(addr.sun_path, len, "%s", file); + + err = bind(sock, (struct sockaddr *) &addr, sizeof(addr)); + if (err < 0){ + printk("create_listening_socket - bind failed, errno = %d\n", + errno); + return(-errno); + } + + return(sock); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/sys-i386/Makefile b/arch/um/sys-i386/Makefile --- a/arch/um/sys-i386/Makefile Fri Oct 31 14:10:53 2003 +++ b/arch/um/sys-i386/Makefile Fri Oct 31 14:10:53 2003 @@ -1,7 +1,8 @@ -obj-y = bugs.o checksum.o extable.o fault.o ksyms.o ldt.o module.o \ - ptrace.o ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o +obj-y = bugs.o checksum.o extable.o fault.o ksyms.o ldt.o ptrace.o \ + ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o obj-$(CONFIG_HIGHMEM) += highmem.o +obj-$(CONFIG_MODULES) += module.o USER_OBJS := bugs.o ptrace_user.o sigcontext.o fault.o USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) @@ -9,6 +10,8 @@ SYMLINKS = semaphore.c highmem.c module.c SYMLINKS := $(foreach f,$(SYMLINKS),$(src)/$f) +clean-files := $(SYMLINKS) + semaphore.c-dir = kernel highmem.c-dir = mm module.c-dir = kernel @@ -24,8 +27,7 @@ $(SYMLINKS): $(call make_link,$@) -clean: - $(MAKE) -C util clean +subdir- := util fastdep: diff -Nru a/arch/um/sys-i386/Makefile~uml-summa.diff b/arch/um/sys-i386/Makefile~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/sys-i386/Makefile~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,42 @@ +obj-y = bugs.o checksum.o extable.o fault.o ksyms.o ldt.o module.o \ + ptrace.o ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o + +obj-$(CONFIG_HIGHMEM) += highmem.o + +USER_OBJS := bugs.o ptrace_user.o sigcontext.o fault.o +USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) + +SYMLINKS = semaphore.c highmem.c module.c +SYMLINKS := $(foreach f,$(SYMLINKS),$(src)/$f) + +semaphore.c-dir = kernel +highmem.c-dir = mm +module.c-dir = kernel + +define make_link + -rm -f $1 + ln -sf $(TOPDIR)/arch/i386/$($(notdir $1)-dir)/$(notdir $1) $1 +endef + +$(USER_OBJS) : %.o: %.c + $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< + +$(SYMLINKS): + $(call make_link,$@) + +clean: + $(MAKE) -C util clean + +fastdep: + +dep: + +archmrproper: + rm -f $(SYMLINKS) + +archclean: + +archdep: + +modules: + diff -Nru a/arch/um/sys-i386/bugs.c b/arch/um/sys-i386/bugs.c --- a/arch/um/sys-i386/bugs.c Fri Oct 31 14:10:54 2003 +++ b/arch/um/sys-i386/bugs.c Fri Oct 31 14:10:54 2003 @@ -8,6 +8,7 @@ #include <errno.h> #include <string.h> #include <sys/signal.h> +#include <asm/ldt.h> #include "kern_util.h" #include "user.h" #include "sysdep/ptrace.h" @@ -16,8 +17,8 @@ #define MAXTOKEN 64 /* Set during early boot */ -int cpu_has_cmov = 1; -int cpu_has_xmm = 0; 
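
The descriptor passing used by os_rcv_fd() above (and by tuntap_open_tramp() earlier in this patch) rides on SCM_RIGHTS ancillary data over a Unix-domain socketpair: the normal payload carries the helper's pid, the control message carries the open fd. The sending side lives in the external uml_net helper and is not part of this patch; a minimal sketch of its shape, with an illustrative function name and error handling trimmed to -errno returns:

	#include <errno.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <sys/uio.h>

	/* Illustrative counterpart to os_rcv_fd(); not part of this patch. */
	static int send_fd_and_pid(int sock, int fd, int pid)
	{
		struct iovec iov = { .iov_base = &pid, .iov_len = sizeof(pid) };
		char anc[CMSG_SPACE(sizeof(fd))];
		struct msghdr msg;
		struct cmsghdr *cmsg;

		memset(&msg, 0, sizeof(msg));
		msg.msg_iov = &iov;		/* normal payload: our pid */
		msg.msg_iovlen = 1;
		msg.msg_control = anc;		/* ancillary payload: the fd */
		msg.msg_controllen = sizeof(anc);

		cmsg = CMSG_FIRSTHDR(&msg);
		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_type = SCM_RIGHTS;
		cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
		memcpy(CMSG_DATA(cmsg), &fd, sizeof(fd));

		return sendmsg(sock, &msg, 0) < 0 ? -errno : 0;
	}
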
+int host_has_cmov = 1; +int host_has_xmm = 0; static char token(int fd, char *buf, int len, char stop) { @@ -104,6 +105,25 @@ return(1); } +static void disable_lcall(void) +{ + struct modify_ldt_ldt_s ldt; + int err; + + bzero(&ldt, sizeof(ldt)); + ldt.entry_number = 7; + ldt.base_addr = 0; + ldt.limit = 0; + err = modify_ldt(1, &ldt, sizeof(ldt)); + if(err) + printk("Failed to disable lcall7 - errno = %d\n", errno); +} + +void arch_init_thread(void) +{ + disable_lcall(); +} + void arch_check_bugs(void) { int have_it; @@ -113,8 +133,8 @@ "checks\n"); return; } - if(check_cpu_feature("cmov", &have_it)) cpu_has_cmov = have_it; - if(check_cpu_feature("xmm", &have_it)) cpu_has_xmm = have_it; + if(check_cpu_feature("cmov", &have_it)) host_has_cmov = have_it; + if(check_cpu_feature("xmm", &have_it)) host_has_xmm = have_it; } int arch_handle_signal(int sig, union uml_pt_regs *regs) @@ -130,18 +150,18 @@ if((*((char *) ip) != 0x0f) || ((*((char *) (ip + 1)) & 0xf0) != 0x40)) return(0); - if(cpu_has_cmov == 0) + if(host_has_cmov == 0) panic("SIGILL caused by cmov, which this processor doesn't " "implement, boot a filesystem compiled for older " "processors"); - else if(cpu_has_cmov == 1) + else if(host_has_cmov == 1) panic("SIGILL caused by cmov, which this processor claims to " "implement"); - else if(cpu_has_cmov == -1) + else if(host_has_cmov == -1) panic("SIGILL caused by cmov, couldn't tell if this processor " "implements it, boot a filesystem compiled for older " "processors"); - else panic("Bad value for cpu_has_cmov (%d)", cpu_has_cmov); + else panic("Bad value for host_has_cmov (%d)", host_has_cmov); return(0); } diff -Nru a/arch/um/sys-i386/bugs.c~uml-summa.diff b/arch/um/sys-i386/bugs.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/sys-i386/bugs.c~uml-summa.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> +#include <string.h> +#include <sys/signal.h> +#include "kern_util.h" +#include "user.h" +#include "sysdep/ptrace.h" +#include "task.h" + +#define MAXTOKEN 64 + +/* Set during early boot */ +int cpu_has_cmov = 1; +int cpu_has_xmm = 0; + +static char token(int fd, char *buf, int len, char stop) +{ + int n; + char *ptr, *end, c; + + ptr = buf; + end = &buf[len]; + do { + n = read(fd, ptr, sizeof(*ptr)); + c = *ptr++; + if(n == 0) return(0); + else if(n != sizeof(*ptr)){ + printk("Reading /proc/cpuinfo failed, " + "errno = %d\n", errno); + return(-errno); + } + } while((c != '\n') && (c != stop) && (ptr < end)); + + if(ptr == end){ + printk("Failed to find '%c' in /proc/cpuinfo\n", stop); + return(-1); + } + *(ptr - 1) = '\0'; + return(c); +} + +static int check_cpu_feature(char *feature, int *have_it) +{ + char buf[MAXTOKEN], c; + int fd, len = sizeof(buf)/sizeof(buf[0]), n; + + printk("Checking for host processor %s support...", feature); + fd = open("/proc/cpuinfo", O_RDONLY); + if(fd < 0){ + printk("Couldn't open /proc/cpuinfo, errno = %d\n", errno); + return(0); + } + + *have_it = 0; + buf[len - 1] = '\0'; + while(1){ + c = token(fd, buf, len - 1, ':'); + if(c <= 0) goto out; + else if(c != ':'){ + printk("Failed to find ':' in /proc/cpuinfo\n"); + goto out; + } + + if(!strncmp(buf, "flags", strlen("flags"))) break; + + do { + n = read(fd, &c, sizeof(c)); + if(n != sizeof(c)){ + printk("Failed to find newline in " + "/proc/cpuinfo, n = %d, errno = %d\n", + n, errno); + goto out; + } + } while(c != '\n'); + } + + c 
= token(fd, buf, len - 1, ' '); + if(c < 0) goto out; + else if(c != ' '){ + printk("Failed to find ':' in /proc/cpuinfo\n"); + goto out; + } + + while(1){ + c = token(fd, buf, len - 1, ' '); + if(c < 0) goto out; + else if(c == '\n') break; + + if(!strcmp(buf, feature)){ + *have_it = 1; + goto out; + } + } + out: + if(*have_it == 0) printk("No\n"); + else if(*have_it == 1) printk("Yes\n"); + close(fd); + return(1); +} + +void arch_check_bugs(void) +{ + int have_it; + + if(access("/proc/cpuinfo", R_OK)){ + printk("/proc/cpuinfo not available - skipping CPU capability " + "checks\n"); + return; + } + if(check_cpu_feature("cmov", &have_it)) cpu_has_cmov = have_it; + if(check_cpu_feature("xmm", &have_it)) cpu_has_xmm = have_it; +} + +int arch_handle_signal(int sig, union uml_pt_regs *regs) +{ + unsigned long ip; + + /* This is testing for a cmov (0x0f 0x4x) instruction causing a + * SIGILL in init. + */ + if((sig != SIGILL) || (TASK_PID(get_current()) != 1)) return(0); + + ip = UPT_IP(regs); + if((*((char *) ip) != 0x0f) || ((*((char *) (ip + 1)) & 0xf0) != 0x40)) + return(0); + + if(cpu_has_cmov == 0) + panic("SIGILL caused by cmov, which this processor doesn't " + "implement, boot a filesystem compiled for older " + "processors"); + else if(cpu_has_cmov == 1) + panic("SIGILL caused by cmov, which this processor claims to " + "implement"); + else if(cpu_has_cmov == -1) + panic("SIGILL caused by cmov, couldn't tell if this processor " + "implements it, boot a filesystem compiled for older " + "processors"); + else panic("Bad value for cpu_has_cmov (%d)", cpu_has_cmov); + return(0); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. 
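
check_cpu_feature() above walks /proc/cpuinfo with a hand-rolled token() scanner. Stripped of the scanner, the test it performs reduces to a few lines of stdio; a rough equivalent for illustration only — note the caveats in the comment, which the real parser does not share:

	#include <stdio.h>
	#include <string.h>

	/* Rough equivalent of check_cpu_feature(); illustrative only.
	 * Caveats: assumes the "flags" line fits in the buffer, and
	 * strstr() can false-positive on flags that are substrings of
	 * other flags (e.g. "sse" matching "ssse3") -- the token-based
	 * parser above avoids both problems. */
	static int cpu_flag_present(const char *flag)
	{
		char line[1024];
		FILE *f = fopen("/proc/cpuinfo", "r");
		int found = 0;

		if (f == NULL)
			return -1;
		while (fgets(line, sizeof(line), f) != NULL) {
			if (strncmp(line, "flags", strlen("flags")))
				continue;
			found = (strstr(line, flag) != NULL);
			break;
		}
		fclose(f);
		return found;
	}
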
+ * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/sys-i386/fault.c b/arch/um/sys-i386/fault.c --- a/arch/um/sys-i386/fault.c Fri Oct 31 14:10:54 2003 +++ b/arch/um/sys-i386/fault.c Fri Oct 31 14:10:54 2003 @@ -7,14 +7,24 @@ #include "sysdep/ptrace.h" #include "sysdep/sigcontext.h" -extern unsigned long search_exception_table(unsigned long addr); +struct exception_table_entry +{ + unsigned long insn; + unsigned long fixup; +}; +const struct exception_table_entry *search_exception_tables(unsigned long add); int arch_fixup(unsigned long address, void *sc_ptr) { struct sigcontext *sc = sc_ptr; - unsigned long fixup; + long fixup; + const struct exception_table_entry *ete; - fixup = search_exception_tables(address); + ete = search_exception_tables(address); + if (!ete) + return 0; + + fixup = ete->fixup; if(fixup != 0){ sc->eip = fixup; return(1); diff -Nru a/arch/um/sys-i386/fault.c~uml-summa.diff b/arch/um/sys-i386/fault.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/sys-i386/fault.c~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include <signal.h> +#include "sysdep/ptrace.h" +#include "sysdep/sigcontext.h" + +extern unsigned long search_exception_table(unsigned long addr); + +int arch_fixup(unsigned long address, void *sc_ptr) +{ + struct sigcontext *sc = sc_ptr; + unsigned long fixup; + + fixup = search_exception_tables(address); + if(fixup != 0){ + sc->eip = fixup; + return(1); + } + return(0); +} + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/arch/um/uml.lds.S b/arch/um/uml.lds.S --- a/arch/um/uml.lds.S Fri Oct 31 14:10:53 2003 +++ b/arch/um/uml.lds.S Fri Oct 31 14:10:53 2003 @@ -26,7 +26,11 @@ . = ALIGN(4096); /* Init code and data */ _stext = .; __init_begin = .; - .text.init : { *(.text.init) } + .init.text : { + _sinittext = .; + *(.init.text) + _einittext = .; + } . = ALIGN(4096); .text : { @@ -38,7 +42,7 @@ #include "asm/common.lds.S" - .data.init : { *(.data.init) } + init.data : { *(init.data) } .data : { . = ALIGN(KERNEL_STACK_SIZE); /* init_task */ diff -Nru a/arch/um/uml.lds.S~uml-summa.diff b/arch/um/uml.lds.S~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/uml.lds.S~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,92 @@ +#include <asm-generic/vmlinux.lds.h> + +OUTPUT_FORMAT(ELF_FORMAT) +OUTPUT_ARCH(ELF_ARCH) +ENTRY(_start) +jiffies = jiffies_64; + +SECTIONS +{ + . = START + SIZEOF_HEADERS; + + . = ALIGN(4096); + __binary_start = .; +#ifdef MODE_TT + .thread_private : { + __start_thread_private = .; + errno = .; + . += 4; + arch/um/kernel/tt/unmap_fin.o (.data) + __end_thread_private = .; + } + . = ALIGN(4096); + .remap : { arch/um/kernel/tt/unmap_fin.o (.text) } +#endif + + . = ALIGN(4096); /* Init code and data */ + _stext = .; + __init_begin = .; + .text.init : { *(.text.init) } + . = ALIGN(4096); + .text : + { + *(.text) + /* .gnu.warning sections are handled specially by elf32.em. 
*/ + *(.gnu.warning) + *(.gnu.linkonce.t*) + } + + #include "asm/common.lds.S" + + .data.init : { *(.data.init) } + .data : + { + . = ALIGN(KERNEL_STACK_SIZE); /* init_task */ + *(.data.init_task) + *(.data) + *(.gnu.linkonce.d*) + CONSTRUCTORS + } + .data1 : { *(.data1) } + .ctors : + { + *(.ctors) + } + .dtors : + { + *(.dtors) + } + + .got : { *(.got.plt) *(.got) } + .dynamic : { *(.dynamic) } + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : { *(.sdata) } + _edata = .; + PROVIDE (edata = .); + . = ALIGN(0x1000); + .sbss : + { + __bss_start = .; + PROVIDE(_bss_start = .); + *(.sbss) + *(.scommon) + } + .bss : + { + *(.dynbss) + *(.bss) + *(COMMON) + } + _end = . ; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } +} diff -Nru a/arch/um/util/mk_constants_kern.c b/arch/um/util/mk_constants_kern.c --- a/arch/um/util/mk_constants_kern.c Fri Oct 31 14:10:53 2003 +++ b/arch/um/util/mk_constants_kern.c Fri Oct 31 14:10:53 2003 @@ -1,5 +1,6 @@ #include "linux/kernel.h" #include "linux/stringify.h" +#include "linux/time.h" #include "asm/page.h" extern void print_head(void); @@ -11,6 +12,7 @@ { print_head(); print_constant_int("UM_KERN_PAGE_SIZE", PAGE_SIZE); + print_constant_str("UM_KERN_EMERG", KERN_EMERG); print_constant_str("UM_KERN_ALERT", KERN_ALERT); print_constant_str("UM_KERN_CRIT", KERN_CRIT); @@ -19,6 +21,8 @@ print_constant_str("UM_KERN_NOTICE", KERN_NOTICE); print_constant_str("UM_KERN_INFO", KERN_INFO); print_constant_str("UM_KERN_DEBUG", KERN_DEBUG); + + print_constant_int("UM_NSEC_PER_SEC", NSEC_PER_SEC); print_tail(); return(0); } diff -Nru a/arch/um/util/mk_constants_kern.c~uml-summa.diff b/arch/um/util/mk_constants_kern.c~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/arch/um/util/mk_constants_kern.c~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,24 @@ +#include "linux/kernel.h" +#include "linux/stringify.h" +#include "asm/page.h" + +extern void print_head(void); +extern void print_constant_str(char *name, char *value); +extern void print_constant_int(char *name, int value); +extern void print_tail(void); + +int main(int argc, char **argv) +{ + print_head(); + print_constant_int("UM_KERN_PAGE_SIZE", PAGE_SIZE); + print_constant_str("UM_KERN_EMERG", KERN_EMERG); + print_constant_str("UM_KERN_ALERT", KERN_ALERT); + print_constant_str("UM_KERN_CRIT", KERN_CRIT); + print_constant_str("UM_KERN_ERR", KERN_ERR); + print_constant_str("UM_KERN_WARNING", KERN_WARNING); + print_constant_str("UM_KERN_NOTICE", KERN_NOTICE); + print_constant_str("UM_KERN_INFO", KERN_INFO); + print_constant_str("UM_KERN_DEBUG", KERN_DEBUG); + print_tail(); + return(0); +} diff -Nru a/arch/x86_64/kernel/acpi/boot.c b/arch/x86_64/kernel/acpi/boot.c --- a/arch/x86_64/kernel/acpi/boot.c Fri Oct 31 14:10:53 2003 +++ b/arch/x86_64/kernel/acpi/boot.c Fri Oct 31 14:10:53 2003 @@ -251,6 +251,33 @@ } #endif +#ifdef CONFIG_ACPI_BUS +/* + * Set specified PIC IRQ to level triggered mode. + * + * Port 0x4d0-4d1 are ECLR1 and ECLR2, the Edge/Level Control Registers + * for the 8259 PIC. bit[n] = 1 means irq[n] is Level, otherwise Edge. 
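
A worked instance of the ELCR port/bit layout just described, which the acpi_pic_set_level_irq() body below applies (numbers illustrative, not from the patch):

	/* IRQ 9 -- a common ACPI SCI -- lands in the second register: */
	unsigned int irq = 9;
	unsigned int port = 0x4d0 + (irq >> 3);	/* 0x4d0 + 1 = 0x4d1 (ECLR2) */
	unsigned char mask = 1 << (irq & 7);	/* 1 << 1 = 0x02: bit 1 */
	/* (inb(port) & mask) != 0 would mean IRQ 9 is already level */
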
+ * ECLR1 is IRQ's 0-7 (IRQ 0, 1, 2 must be 0) + * ECLR2 is IRQ's 8-15 (IRQ 8, 13 must be 0) + * + * As the BIOS should have done this for us, + * print a warning if the IRQ wasn't already set to level. + */ + +void acpi_pic_set_level_irq(unsigned int irq) +{ + unsigned char mask = 1 << (irq & 7); + unsigned int port = 0x4d0 + (irq >> 3); + unsigned char val = inb(port); + + if (!(val & mask)) { + printk(KERN_WARNING PREFIX "IRQ %d was Edge Triggered, " + "setting to Level Triggerd\n", irq); + outb(val | mask, port); + } +} +#endif /* CONFIG_ACPI_BUS */ + static unsigned long __init acpi_scan_rsdp ( unsigned long start, diff -Nru a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S --- a/arch/x86_64/kernel/entry.S Fri Oct 31 14:10:53 2003 +++ b/arch/x86_64/kernel/entry.S Fri Oct 31 14:10:53 2003 @@ -566,8 +566,14 @@ incl %ebx /* There are two places in the kernel that can potentially fault with usergs. Handle them here. The exception handlers after - iret run with kernel gs again, so don't set the user space flag. */ - cmpq $iret_label,RIP(%rsp) + iret run with kernel gs again, so don't set the user space flag. + B stepping K8s sometimes report an truncated RIP for IRET + exceptions returning to compat mode. Check for these here too. */ + leaq iret_label(%rip),%rbp + cmpq %rbp,RIP(%rsp) + je error_swapgs + movl %ebp,%ebp /* zero extend */ + cmpq %rbp,RIP(%rsp) je error_swapgs cmpq $gs_change,RIP(%rsp) je error_swapgs diff -Nru a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c --- a/arch/x86_64/kernel/io_apic.c Fri Oct 31 14:10:54 2003 +++ b/arch/x86_64/kernel/io_apic.c Fri Oct 31 14:10:54 2003 @@ -622,11 +622,13 @@ return 0; } -int irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 }; +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. 
*/ +u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 }; static int __init assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; + BUG_ON(irq >= NR_IRQ_VECTORS); if (IO_APIC_VECTOR(irq) > 0) return IO_APIC_VECTOR(irq); next: diff -Nru a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c --- a/arch/x86_64/kernel/setup64.c Fri Oct 31 14:10:53 2003 +++ b/arch/x86_64/kernel/setup64.c Fri Oct 31 14:10:53 2003 @@ -189,8 +189,7 @@ pda->irqstackptr += IRQSTACKSIZE-64; } -#define EXCEPTION_STK_ORDER 0 /* >= N_EXCEPTION_STACKS*EXCEPTION_STKSZ */ -char boot_exception_stacks[N_EXCEPTION_STACKS*EXCEPTION_STKSZ]; +char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ]; void syscall_init(void) { @@ -226,15 +225,12 @@ #endif struct tss_struct * t = &init_tss[cpu]; unsigned long v, efer; - char *estacks; + char *estacks = NULL; struct task_struct *me; /* CPU 0 is initialised in head64.c */ if (cpu != 0) { pda_init(cpu); - estacks = (char *)__get_free_pages(GFP_ATOMIC, 0); - if (!estacks) - panic("Can't allocate exception stacks for CPU %d\n",cpu); } else estacks = boot_exception_stacks; @@ -282,10 +278,15 @@ /* * set up and load the per-CPU TSS */ - estacks += EXCEPTION_STKSZ; for (v = 0; v < N_EXCEPTION_STACKS; v++) { - t->ist[v] = (unsigned long)estacks; + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, 0); + if (!estacks) + panic("Cannot allocate exception stack %ld %d\n", + v, cpu); + } estacks += EXCEPTION_STKSZ; + t->ist[v] = (unsigned long)estacks; } t->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; diff -Nru a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c --- a/arch/x86_64/kernel/smp.c Fri Oct 31 14:10:53 2003 +++ b/arch/x86_64/kernel/smp.c Fri Oct 31 14:10:53 2003 @@ -487,25 +487,3 @@ atomic_inc(&call_data->finished); } } - -/* Slow. Should be only used for debugging. */ -int slow_smp_processor_id(void) -{ - int stack_location; - unsigned long sp = (unsigned long)&stack_location; - int offset = 0, cpu; - - for (offset = 0; next_cpu(offset, cpu_online_map) < NR_CPUS; offset = cpu + 1) { - cpu = next_cpu(offset, cpu_online_map); - - if (sp >= (u64)cpu_pda[cpu].irqstackptr - IRQSTACKSIZE && - sp <= (u64)cpu_pda[cpu].irqstackptr) - return cpu; - - unsigned long estack = init_tss[cpu].ist[0] - EXCEPTION_STKSZ; - if (sp >= estack && sp <= estack+(1<<(PAGE_SHIFT+EXCEPTION_STK_ORDER))) - return cpu; - } - - return stack_smp_processor_id(); -} diff -Nru a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c --- a/arch/x86_64/kernel/time.c Fri Oct 31 14:10:54 2003 +++ b/arch/x86_64/kernel/time.c Fri Oct 31 14:10:54 2003 @@ -111,6 +111,14 @@ sec = xtime.tv_sec; usec = xtime.tv_nsec / 1000; + /* + * If time_adjust is negative then NTP is slowing the clock + * so make sure not to go into next possible interval. + * Better to lose some accuracy than have time go backwards.. 
+ */ + if (unlikely(time_adjust < 0) && usec > tickadj) + usec = tickadj; + t = (jiffies - wall_jiffies) * (1000000L / HZ) + do_gettimeoffset(); usec += t; @@ -477,22 +485,28 @@ static unsigned int ref_freq = 0; static unsigned long loops_per_jiffy_ref = 0; -//static unsigned long fast_gettimeoffset_ref = 0; static unsigned long cpu_khz_ref = 0; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; + unsigned long *lpj; + +#ifdef CONFIG_SMP + lpj = &cpu_data[freq->cpu].loops_per_jiffy; +#else + lpj = &boot_cpu_data.loops_per_jiffy; +#endif if (!ref_freq) { ref_freq = freq->old; - loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy; + loops_per_jiffy_ref = *lpj; cpu_khz_ref = cpu_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { - cpu_data[freq->cpu].loops_per_jiffy = + *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); diff -Nru a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c --- a/arch/x86_64/kernel/x8664_ksyms.c Fri Oct 31 14:10:53 2003 +++ b/arch/x86_64/kernel/x8664_ksyms.c Fri Oct 31 14:10:53 2003 @@ -71,6 +71,7 @@ EXPORT_SYMBOL_NOVERS(__up_wakeup); /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_nocheck); +EXPORT_SYMBOL(ip_compute_csum); /* Delay loops */ EXPORT_SYMBOL(__udelay); EXPORT_SYMBOL(__ndelay); @@ -113,6 +114,7 @@ EXPORT_SYMBOL(mmx_copy_page); #endif +EXPORT_SYMBOL(cpu_pda); #ifdef CONFIG_SMP EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(cpu_online_map); @@ -153,7 +155,7 @@ extern void * memset(void *,int,__kernel_size_t); extern size_t strlen(const char *); -extern char * bcopy(const char * src, char * dest, int count); +extern void bcopy(const char * src, char * dest, int count); extern void * memmove(void * dest,const void *src,size_t count); extern char * strcpy(char * dest,const char *src); extern int strcmp(const char * cs,const char * ct); diff -Nru a/arch/x86_64/mm/extable.c b/arch/x86_64/mm/extable.c --- a/arch/x86_64/mm/extable.c Fri Oct 31 14:10:54 2003 +++ b/arch/x86_64/mm/extable.c Fri Oct 31 14:10:54 2003 @@ -14,6 +14,10 @@ const struct exception_table_entry *last, unsigned long value) { + /* Work around a B stepping K8 bug */ + if ((value >> 32) == 0) + value |= 0xffffffffUL << 32; + while (first <= last) { const struct exception_table_entry *mid; long diff; diff -Nru a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c --- a/arch/x86_64/mm/k8topology.c Fri Oct 31 14:10:54 2003 +++ b/arch/x86_64/mm/k8topology.c Fri Oct 31 14:10:54 2003 @@ -164,5 +164,8 @@ rr++; } + if (found == 1) + fake_node = 1; + return 0; } diff -Nru a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c --- a/arch/x86_64/mm/numa.c Fri Oct 31 14:10:54 2003 +++ b/arch/x86_64/mm/numa.c Fri Oct 31 14:10:54 2003 @@ -104,6 +104,7 @@ if (nodeid + 1 > numnodes) numnodes = nodeid + 1; nodes_present |= (1UL << nodeid); + node_set_online(nodeid); } /* Initialize final allocator for a zone */ diff -Nru a/crypto/api.c b/crypto/api.c --- a/crypto/api.c Fri Oct 31 14:10:54 2003 +++ b/crypto/api.c Fri Oct 31 14:10:54 2003 @@ -36,6 +36,9 @@ struct crypto_alg *crypto_alg_lookup(const char *name) { struct crypto_alg *q, *alg = NULL; + + if (!name) + return NULL; down_read(&crypto_alg_sem); diff -Nru a/drivers/acpi/dispatcher/dsopcode.c b/drivers/acpi/dispatcher/dsopcode.c --- a/drivers/acpi/dispatcher/dsopcode.c Fri Oct 31 14:10:54 2003 +++ 
b/drivers/acpi/dispatcher/dsopcode.c Fri Oct 31 14:10:54 2003 @@ -514,16 +514,14 @@ goto cleanup; } + /* Entire field must fit within the current length of the buffer */ if ((bit_offset + bit_count) > (8 * (u32) buffer_desc->buffer.length)) { ACPI_DEBUG_PRINT ((ACPI_DB_ERROR, - "Field [%4.4s] size %d exceeds Buffer [%4.4s] size %d (bits)\n", - ((struct acpi_namespace_node *) result_desc)->name.ascii, - bit_offset + bit_count, - buffer_desc->buffer.node->name.ascii, - 8 * (u32) buffer_desc->buffer.length)); + "Field size %d exceeds Buffer size %d (bits)\n", + bit_offset + bit_count, 8 * (u32) buffer_desc->buffer.length)); status = AE_AML_BUFFER_LIMIT; goto cleanup; } diff -Nru a/drivers/acpi/ec.c b/drivers/acpi/ec.c --- a/drivers/acpi/ec.c Fri Oct 31 14:10:54 2003 +++ b/drivers/acpi/ec.c Fri Oct 31 14:10:54 2003 @@ -94,13 +94,6 @@ /* External interfaces use first EC only, so remember */ static struct acpi_device *first_ec; -/* - * We use kernel thread to handle ec's gpe query, so the query may defer. - * The query need a context, which can be freed when we replace ec_ecdt - * with EC device. So defered query may have a wrong context. - * We use an indication to avoid it - */ -static int ec_device_init = 0; /* -------------------------------------------------------------------------- Transaction Management -------------------------------------------------------------------------- */ @@ -400,11 +393,8 @@ acpi_disable_gpe(NULL, ec->gpe_bit, ACPI_ISR); - if (!ec_device_init) - acpi_ec_gpe_query(ec); /* directly query when device didn't init */ - else - status = acpi_os_queue_for_execution(OSD_PRIORITY_GPE, - acpi_ec_gpe_query, ec); + status = acpi_os_queue_for_execution(OSD_PRIORITY_GPE, + acpi_ec_gpe_query, ec); } /* -------------------------------------------------------------------------- @@ -599,8 +589,6 @@ we now have the *real* EC info, so kill the makeshift one.*/ acpi_evaluate_integer(ec->handle, "_UID", NULL, &uid); if (ec_ecdt && ec_ecdt->uid == uid) { - acpi_disable_gpe(NULL, ec_ecdt->gpe_bit, ACPI_NOT_ISR); - ec_device_init = 1; acpi_remove_address_space_handler(ACPI_ROOT_OBJECT, ACPI_ADR_SPACE_EC, &acpi_ec_space_handler); diff -Nru a/drivers/char/sonypi.h b/drivers/char/sonypi.h --- a/drivers/char/sonypi.h Fri Oct 31 14:10:54 2003 +++ b/drivers/char/sonypi.h Fri Oct 31 14:10:54 2003 @@ -37,7 +37,7 @@ #ifdef __KERNEL__ #define SONYPI_DRIVER_MAJORVERSION 1 -#define SONYPI_DRIVER_MINORVERSION 20 +#define SONYPI_DRIVER_MINORVERSION 21 #define SONYPI_DEVICE_MODEL_TYPE1 1 #define SONYPI_DEVICE_MODEL_TYPE2 2 @@ -329,8 +329,8 @@ { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_PKEY_MASK, sonypi_pkeyev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x11, SONYPI_BACK_MASK, sonypi_backev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_HELP_MASK, sonypi_helpev }, - { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_ZOOM_MASK, sonypi_zoomev }, - { SONYPI_DEVICE_MODEL_TYPE2, 0x08, SONYPI_THUMBPHRASE_MASK, sonypi_thumbphraseev }, + { SONYPI_DEVICE_MODEL_TYPE2, 0x21, SONYPI_ZOOM_MASK, sonypi_zoomev }, + { SONYPI_DEVICE_MODEL_TYPE2, 0x20, SONYPI_THUMBPHRASE_MASK, sonypi_thumbphraseev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x31, SONYPI_MEMORYSTICK_MASK, sonypi_memorystickev }, { SONYPI_DEVICE_MODEL_TYPE2, 0x41, SONYPI_BATTERY_MASK, sonypi_batteryev }, diff -Nru a/drivers/char/tty_io.c b/drivers/char/tty_io.c --- a/drivers/char/tty_io.c Fri Oct 31 14:10:54 2003 +++ b/drivers/char/tty_io.c Fri Oct 31 14:10:54 2003 @@ -2419,12 +2419,17 @@ static struct cdev vc0_cdev; #endif +static int tty_initialized = 0; + /* * Ok, now we can initialize the 
rest of the tty devices and can count * on memory allocations, interrupts etc.. */ -static int __init tty_init(void) +int __init tty_init(void) { + if (tty_initialized) + return 0; + tty_initialized = 1; strcpy(tty_cdev.kobj.name, "dev.tty"); cdev_init(&tty_cdev, &tty_fops); if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) || diff -Nru a/drivers/char/tty_io.c~uml-tty-init.diff b/drivers/char/tty_io.c~uml-tty-init.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/drivers/char/tty_io.c~uml-tty-init.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,2470 @@ +/* + * linux/drivers/char/tty_io.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'tty_io.c' gives an orthogonal feeling to tty's, be they consoles + * or rs-channels. It also implements echoing, cooked mode etc. + * + * Kill-line thanks to John T Kohl, who also corrected VMIN = VTIME = 0. + * + * Modified by Theodore Ts'o, 9/14/92, to dynamically allocate the + * tty_struct and tty_queue structures. Previously there was an array + * of 256 tty_struct's which was statically allocated, and the + * tty_queue structures were allocated at boot time. Both are now + * dynamically allocated only when the tty is open. + * + * Also restructured routines so that there is more of a separation + * between the high-level tty routines (tty_io.c and tty_ioctl.c) and + * the low-level tty routines (serial.c, pty.c, console.c). This + * makes for cleaner and more compact code. -TYT, 9/17/92 + * + * Modified by Fred N. van Kempen, 01/29/93, to add line disciplines + * which can be dynamically activated and de-activated by the line + * discipline handling modules (like SLIP). + * + * NOTE: pay no attention to the line discipline code (yet); its + * interface is still subject to change in this version... + * -- TYT, 1/31/92 + * + * Added functionality to the OPOST tty handling. No delays, but all + * other bits should be there. + * -- Nick Holloway <alfie@dcs.warwick.ac.uk>, 27th May 1993. + * + * Rewrote canonical mode and added more termios flags. + * -- julian@uhunix.uhcc.hawaii.edu (J. Cowley), 13Jan94 + * + * Reorganized FASYNC support so mouse code can share it. + * -- ctm@ardi.com, 9Sep95 + * + * New TIOCLINUX variants added. + * -- mj@k332.feld.cvut.cz, 19-Nov-95 + * + * Restrict vt switching via ioctl() + * -- grif@cs.ucr.edu, 5-Dec-95 + * + * Move console and virtual terminal code to more appropriate files, + * implement CONFIG_VT and generalize console device interface. + * -- Marko Kohtala <Marko.Kohtala@hut.fi>, March 97 + * + * Rewrote init_dev and release_dev to eliminate races. + * -- Bill Hawes <whawes@star.net>, June 97 + * + * Added devfs support. + * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 13-Jan-1998 + * + * Added support for a Unix98-style ptmx device. + * -- C. Scott Ananian <cananian@alumni.princeton.edu>, 14-Jan-1998 + * + * Reduced memory usage for older ARM systems + * -- Russell King <rmk@arm.linux.org.uk> + * + * Move do_SAK() into process context. Less stack use in devfs functions. 
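
The tty_initialized guard added above makes tty_init() idempotent: it can now be called explicitly by an early user and again by the normal initcall without registering everything twice. Reduced to its shape (names illustrative; like tty_init() itself this relies on init-time calls being serialized, so no locking):

	static int subsys_initialized;

	int subsys_init(void)
	{
		if (subsys_initialized)
			return 0;
		subsys_initialized = 1;

		/* ...one-time cdev/driver registration goes here... */
		return 0;
	}
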
+ * alloc_tty_struct() always uses kmalloc() -- Andrew Morton <andrewm@uow.edu.eu> 17Mar01 + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/major.h> +#include <linux/errno.h> +#include <linux/signal.h> +#include <linux/fcntl.h> +#include <linux/sched.h> +#include <linux/interrupt.h> +#include <linux/tty.h> +#include <linux/tty_driver.h> +#include <linux/tty_flip.h> +#include <linux/devpts_fs.h> +#include <linux/file.h> +#include <linux/console.h> +#include <linux/timer.h> +#include <linux/ctype.h> +#include <linux/kd.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/smp_lock.h> +#include <linux/device.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/bitops.h> + +#include <linux/kbd_kern.h> +#include <linux/vt_kern.h> +#include <linux/selection.h> +#include <linux/devfs_fs_kernel.h> + +#include <linux/kmod.h> + +#undef TTY_DEBUG_HANGUP + +#define TTY_PARANOIA_CHECK 1 +#define CHECK_TTY_COUNT 1 + +struct termios tty_std_termios = { /* for the benefit of tty drivers */ + .c_iflag = ICRNL | IXON, + .c_oflag = OPOST | ONLCR, + .c_cflag = B38400 | CS8 | CREAD | HUPCL, + .c_lflag = ISIG | ICANON | ECHO | ECHOE | ECHOK | + ECHOCTL | ECHOKE | IEXTEN, + .c_cc = INIT_C_CC +}; + +EXPORT_SYMBOL(tty_std_termios); + +LIST_HEAD(tty_drivers); /* linked list of tty drivers */ +struct tty_ldisc ldiscs[NR_LDISCS]; /* line disc dispatch table */ + +#ifdef CONFIG_UNIX98_PTYS +extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ +extern struct tty_driver *pts_driver; /* Unix98 pty slaves; for /dev/ptmx */ +#endif + +extern void disable_early_printk(void); + +static void initialize_tty_struct(struct tty_struct *tty); + +static ssize_t tty_read(struct file *, char *, size_t, loff_t *); +static ssize_t tty_write(struct file *, const char *, size_t, loff_t *); +ssize_t redirected_tty_write(struct file *, const char *, size_t, loff_t *); +static unsigned int tty_poll(struct file *, poll_table *); +static int tty_open(struct inode *, struct file *); +static int tty_release(struct inode *, struct file *); +int tty_ioctl(struct inode * inode, struct file * file, + unsigned int cmd, unsigned long arg); +static int tty_fasync(int fd, struct file * filp, int on); +extern void rs_360_init(void); + +static struct tty_struct *alloc_tty_struct(void) +{ + struct tty_struct *tty; + + tty = kmalloc(sizeof(struct tty_struct), GFP_KERNEL); + if (tty) + memset(tty, 0, sizeof(struct tty_struct)); + return tty; +} + +static inline void free_tty_struct(struct tty_struct *tty) +{ + kfree(tty); +} + +#define TTY_NUMBER(tty) ((tty)->index + (tty)->driver->name_base) + +char *tty_name(struct tty_struct *tty, char *buf) +{ + if (!tty) /* Hmm. NULL pointer. That's fun. 
*/ + strcpy(buf, "NULL tty"); + else + strcpy(buf, tty->name); + return buf; +} + +EXPORT_SYMBOL(tty_name); + +inline int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, + const char *routine) +{ +#ifdef TTY_PARANOIA_CHECK + if (!tty) { + printk(KERN_WARNING + "null TTY for (%d:%d) in %s\n", + imajor(inode), iminor(inode), routine); + return 1; + } + if (tty->magic != TTY_MAGIC) { + printk(KERN_WARNING + "bad magic number for tty struct (%d:%d) in %s\n", + imajor(inode), iminor(inode), routine); + return 1; + } +#endif + return 0; +} + +static int check_tty_count(struct tty_struct *tty, const char *routine) +{ +#ifdef CHECK_TTY_COUNT + struct list_head *p; + int count = 0; + + file_list_lock(); + list_for_each(p, &tty->tty_files) { + count++; + } + file_list_unlock(); + if (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_SLAVE && + tty->link && tty->link->count) + count++; + if (tty->count != count) { + printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " + "!= #fd's(%d) in %s\n", + tty->name, tty->count, count, routine); + return count; + } +#endif + return 0; +} + +int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc) +{ + if (disc < N_TTY || disc >= NR_LDISCS) + return -EINVAL; + + if (new_ldisc) { + ldiscs[disc] = *new_ldisc; + ldiscs[disc].flags |= LDISC_FLAG_DEFINED; + ldiscs[disc].num = disc; + } else + memset(&ldiscs[disc], 0, sizeof(struct tty_ldisc)); + + return 0; +} + +EXPORT_SYMBOL(tty_register_ldisc); + +/* Set the discipline of a tty line. */ +static int tty_set_ldisc(struct tty_struct *tty, int ldisc) +{ + int retval = 0; + struct tty_ldisc o_ldisc; + char buf[64]; + + if ((ldisc < N_TTY) || (ldisc >= NR_LDISCS)) + return -EINVAL; + /* Eduardo Blanco <ejbs@cs.cs.com.uy> */ + /* Cyrus Durgin <cider@speakeasy.org> */ + if (!(ldiscs[ldisc].flags & LDISC_FLAG_DEFINED)) { + request_module("tty-ldisc-%d", ldisc); + } + if (!(ldiscs[ldisc].flags & LDISC_FLAG_DEFINED)) + return -EINVAL; + + if (tty->ldisc.num == ldisc) + return 0; /* We are already in the desired discipline */ + + if (!try_module_get(ldiscs[ldisc].owner)) + return -EINVAL; + + o_ldisc = tty->ldisc; + + tty_wait_until_sent(tty, 0); + + /* Shutdown the current discipline. */ + if (tty->ldisc.close) + (tty->ldisc.close)(tty); + + /* Now set up the new line discipline. */ + tty->ldisc = ldiscs[ldisc]; + tty->termios->c_line = ldisc; + if (tty->ldisc.open) + retval = (tty->ldisc.open)(tty); + if (retval < 0) { + tty->ldisc = o_ldisc; + tty->termios->c_line = tty->ldisc.num; + if (tty->ldisc.open && (tty->ldisc.open(tty) < 0)) { + tty->ldisc = ldiscs[N_TTY]; + tty->termios->c_line = N_TTY; + if (tty->ldisc.open) { + int r = tty->ldisc.open(tty); + + if (r < 0) + panic("Couldn't open N_TTY ldisc for " + "%s --- error %d.", + tty_name(tty, buf), r); + } + } + } else { + module_put(o_ldisc.owner); + } + + if (tty->ldisc.num != o_ldisc.num && tty->driver->set_ldisc) + tty->driver->set_ldisc(tty); + return retval; +} + +/* + * This routine returns a tty driver structure, given a device number + */ +struct tty_driver *get_tty_driver(dev_t device, int *index) +{ + struct tty_driver *p; + + list_for_each_entry(p, &tty_drivers, tty_drivers) { + dev_t base = MKDEV(p->major, p->minor_start); + if (device < base || device >= base + p->num) + continue; + *index = device - base; + return p; + } + return NULL; +} + +/* + * If we try to write to, or set the state of, a terminal and we're + * not in the foreground, send a SIGTTOU. 
If the signal is blocked or + * ignored, go ahead and perform the operation. (POSIX 7.2) + */ +int tty_check_change(struct tty_struct * tty) +{ + if (current->tty != tty) + return 0; + if (tty->pgrp <= 0) { + printk(KERN_WARNING "tty_check_change: tty->pgrp <= 0!\n"); + return 0; + } + if (process_group(current) == tty->pgrp) + return 0; + if (is_ignored(SIGTTOU)) + return 0; + if (is_orphaned_pgrp(process_group(current))) + return -EIO; + (void) kill_pg(process_group(current), SIGTTOU, 1); + return -ERESTARTSYS; +} + +EXPORT_SYMBOL(tty_check_change); + +static ssize_t hung_up_tty_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + /* Can't seek (pread) on ttys. */ + if (ppos != &file->f_pos) + return -ESPIPE; + return 0; +} + +static ssize_t hung_up_tty_write(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + /* Can't seek (pwrite) on ttys. */ + if (ppos != &file->f_pos) + return -ESPIPE; + return -EIO; +} + +/* No kernel lock held - none needed ;) */ +static unsigned int hung_up_tty_poll(struct file * filp, poll_table * wait) +{ + return POLLIN | POLLOUT | POLLERR | POLLHUP | POLLRDNORM | POLLWRNORM; +} + +static int hung_up_tty_ioctl(struct inode * inode, struct file * file, + unsigned int cmd, unsigned long arg) +{ + return cmd == TIOCSPGRP ? -ENOTTY : -EIO; +} + +static struct file_operations tty_fops = { + .llseek = no_llseek, + .read = tty_read, + .write = tty_write, + .poll = tty_poll, + .ioctl = tty_ioctl, + .open = tty_open, + .release = tty_release, + .fasync = tty_fasync, +}; + +static struct file_operations console_fops = { + .llseek = no_llseek, + .read = tty_read, + .write = redirected_tty_write, + .poll = tty_poll, + .ioctl = tty_ioctl, + .open = tty_open, + .release = tty_release, + .fasync = tty_fasync, +}; + +static struct file_operations hung_up_tty_fops = { + .llseek = no_llseek, + .read = hung_up_tty_read, + .write = hung_up_tty_write, + .poll = hung_up_tty_poll, + .ioctl = hung_up_tty_ioctl, + .release = tty_release, +}; + +static spinlock_t redirect_lock = SPIN_LOCK_UNLOCKED; +static struct file *redirect; +/* + * This can be called by the "eventd" kernel thread. That is process synchronous, + * but doesn't hold any locks, so we need to make sure we have the appropriate + * locks for what we're doing.. + */ +void do_tty_hangup(void *data) +{ + struct tty_struct *tty = (struct tty_struct *) data; + struct file * cons_filp = NULL; + struct file *filp, *f = NULL; + struct task_struct *p; + struct pid *pid; + int closecount = 0, n; + + if (!tty) + return; + + /* inuse_filps is protected by the single kernel lock */ + lock_kernel(); + + spin_lock(&redirect_lock); + if (redirect && redirect->private_data == tty) { + f = redirect; + redirect = NULL; + } + spin_unlock(&redirect_lock); + if (f) + fput(f); + + check_tty_count(tty, "do_tty_hangup"); + file_list_lock(); + list_for_each_entry(filp, &tty->tty_files, f_list) { + if (filp->f_op->write == redirected_tty_write) + cons_filp = filp; + if (filp->f_op->write != tty_write) + continue; + closecount++; + tty_fasync(-1, filp, 0); /* can't block */ + filp->f_op = &hung_up_tty_fops; + } + file_list_unlock(); + + /* FIXME! What are the locking issues here? This may me overdoing things.. + * this question is especially important now that we've removed the irqlock. */ + { + unsigned long flags; + + local_irq_save(flags); // FIXME: is this safe? 
+ if (tty->ldisc.flush_buffer) + tty->ldisc.flush_buffer(tty); + if (tty->driver->flush_buffer) + tty->driver->flush_buffer(tty); + if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) && + tty->ldisc.write_wakeup) + (tty->ldisc.write_wakeup)(tty); + local_irq_restore(flags); // FIXME: is this safe? + } + + wake_up_interruptible(&tty->write_wait); + wake_up_interruptible(&tty->read_wait); + + /* + * Shutdown the current line discipline, and reset it to + * N_TTY. + */ + if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) + *tty->termios = tty->driver->init_termios; + if (tty->ldisc.num != ldiscs[N_TTY].num) { + if (tty->ldisc.close) + (tty->ldisc.close)(tty); + module_put(tty->ldisc.owner); + + tty->ldisc = ldiscs[N_TTY]; + tty->termios->c_line = N_TTY; + if (tty->ldisc.open) { + int i = (tty->ldisc.open)(tty); + if (i < 0) + printk(KERN_ERR "do_tty_hangup: N_TTY open: " + "error %d\n", -i); + } + } + + read_lock(&tasklist_lock); + if (tty->session > 0) { + struct list_head *l; + for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) { + if (p->tty == tty) + p->tty = NULL; + if (!p->leader) + continue; + send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p); + send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p); + if (tty->pgrp > 0) + p->tty_old_pgrp = tty->pgrp; + } + } + read_unlock(&tasklist_lock); + + tty->flags = 0; + tty->session = 0; + tty->pgrp = -1; + tty->ctrl_status = 0; + /* + * If one of the devices matches a console pointer, we + * cannot just call hangup() because that will cause + * tty->count and state->count to go out of sync. + * So we just call close() the right number of times. + */ + if (cons_filp) { + if (tty->driver->close) + for (n = 0; n < closecount; n++) + tty->driver->close(tty, cons_filp); + } else if (tty->driver->hangup) + (tty->driver->hangup)(tty); + unlock_kernel(); +} + +void tty_hangup(struct tty_struct * tty) +{ +#ifdef TTY_DEBUG_HANGUP + char buf[64]; + + printk(KERN_DEBUG "%s hangup...\n", tty_name(tty, buf)); +#endif + schedule_work(&tty->hangup_work); +} + +EXPORT_SYMBOL(tty_hangup); + +void tty_vhangup(struct tty_struct * tty) +{ +#ifdef TTY_DEBUG_HANGUP + char buf[64]; + + printk(KERN_DEBUG "%s vhangup...\n", tty_name(tty, buf)); +#endif + do_tty_hangup((void *) tty); +} +EXPORT_SYMBOL(tty_vhangup); + +int tty_hung_up_p(struct file * filp) +{ + return (filp->f_op == &hung_up_tty_fops); +} + +EXPORT_SYMBOL(tty_hung_up_p); + +/* + * This function is typically called only by the session leader, when + * it wants to disassociate itself from its controlling tty. + * + * It performs the following functions: + * (1) Sends a SIGHUP and SIGCONT to the foreground process group + * (2) Clears the tty from being controlling the session + * (3) Clears the controlling tty for all processes in the + * session group. + * + * The argument on_exit is set to 1 if called when a process is + * exiting; it is 0 if called by the ioctl TIOCNOTTY. 
+ */ +void disassociate_ctty(int on_exit) +{ + struct tty_struct *tty; + struct task_struct *p; + struct list_head *l; + struct pid *pid; + int tty_pgrp = -1; + + lock_kernel(); + + tty = current->tty; + if (tty) { + tty_pgrp = tty->pgrp; + if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY) + tty_vhangup(tty); + } else { + if (current->tty_old_pgrp) { + kill_pg(current->tty_old_pgrp, SIGHUP, on_exit); + kill_pg(current->tty_old_pgrp, SIGCONT, on_exit); + } + unlock_kernel(); + return; + } + if (tty_pgrp > 0) { + kill_pg(tty_pgrp, SIGHUP, on_exit); + if (!on_exit) + kill_pg(tty_pgrp, SIGCONT, on_exit); + } + + current->tty_old_pgrp = 0; + tty->session = 0; + tty->pgrp = -1; + + read_lock(&tasklist_lock); + for_each_task_pid(current->session, PIDTYPE_SID, p, l, pid) + p->tty = NULL; + read_unlock(&tasklist_lock); + unlock_kernel(); +} + +void stop_tty(struct tty_struct *tty) +{ + if (tty->stopped) + return; + tty->stopped = 1; + if (tty->link && tty->link->packet) { + tty->ctrl_status &= ~TIOCPKT_START; + tty->ctrl_status |= TIOCPKT_STOP; + wake_up_interruptible(&tty->link->read_wait); + } + if (tty->driver->stop) + (tty->driver->stop)(tty); +} + +EXPORT_SYMBOL(stop_tty); + +void start_tty(struct tty_struct *tty) +{ + if (!tty->stopped || tty->flow_stopped) + return; + tty->stopped = 0; + if (tty->link && tty->link->packet) { + tty->ctrl_status &= ~TIOCPKT_STOP; + tty->ctrl_status |= TIOCPKT_START; + wake_up_interruptible(&tty->link->read_wait); + } + if (tty->driver->start) + (tty->driver->start)(tty); + if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) && + tty->ldisc.write_wakeup) + (tty->ldisc.write_wakeup)(tty); + wake_up_interruptible(&tty->write_wait); +} + +EXPORT_SYMBOL(start_tty); + +static ssize_t tty_read(struct file * file, char * buf, size_t count, + loff_t *ppos) +{ + int i; + struct tty_struct * tty; + struct inode *inode; + + /* Can't seek (pread) on ttys. 
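/*
 * For illustration: stop_tty() and start_tty() above are what the
 * driver sees when userspace toggles output flow control, e.g. with
 * tcflow(3).  Sketch under that assumption; plain termios calls.
 */
#include <termios.h>

int pause_and_resume_output(int fd)
{
	/* TCOOFF suspends output; the TCXONC ioctl ends up in stop_tty(). */
	if (tcflow(fd, TCOOFF) < 0)
		return -1;
	/* ... do slow bookkeeping while the device is quiet ... */
	/* TCOON resumes via start_tty(), which also fires the write
	 * wakeup so a writer blocked on write_wait makes progress. */
	return tcflow(fd, TCOON);
}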
*/ + if (ppos != &file->f_pos) + return -ESPIPE; + + tty = (struct tty_struct *)file->private_data; + inode = file->f_dentry->d_inode; + if (tty_paranoia_check(tty, inode, "tty_read")) + return -EIO; + if (!tty || (test_bit(TTY_IO_ERROR, &tty->flags))) + return -EIO; + + lock_kernel(); + if (tty->ldisc.read) + i = (tty->ldisc.read)(tty,file,buf,count); + else + i = -EIO; + unlock_kernel(); + if (i > 0) + inode->i_atime = CURRENT_TIME; + return i; +} + +/* + * Split writes up in sane blocksizes to avoid + * denial-of-service type attacks + */ +static inline ssize_t do_tty_write( + ssize_t (*write)(struct tty_struct *, struct file *, const unsigned char *, size_t), + struct tty_struct *tty, + struct file *file, + const unsigned char *buf, + size_t count) +{ + ssize_t ret = 0, written = 0; + + if (down_interruptible(&tty->atomic_write)) { + return -ERESTARTSYS; + } + if ( test_bit(TTY_NO_WRITE_SPLIT, &tty->flags) ) { + lock_kernel(); + written = write(tty, file, buf, count); + unlock_kernel(); + } else { + for (;;) { + unsigned long size = max((unsigned long)PAGE_SIZE*2, 16384UL); + if (size > count) + size = count; + lock_kernel(); + ret = write(tty, file, buf, size); + unlock_kernel(); + if (ret <= 0) + break; + written += ret; + buf += ret; + count -= ret; + if (!count) + break; + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + cond_resched(); + } + } + if (written) { + file->f_dentry->d_inode->i_mtime = CURRENT_TIME; + ret = written; + } + up(&tty->atomic_write); + return ret; +} + + +static ssize_t tty_write(struct file * file, const char * buf, size_t count, + loff_t *ppos) +{ + struct tty_struct * tty; + struct inode *inode = file->f_dentry->d_inode; + + /* Can't seek (pwrite) on ttys. */ + if (ppos != &file->f_pos) + return -ESPIPE; + + tty = (struct tty_struct *)file->private_data; + if (tty_paranoia_check(tty, inode, "tty_write")) + return -EIO; + if (!tty || !tty->driver->write || (test_bit(TTY_IO_ERROR, &tty->flags))) + return -EIO; + if (!tty->ldisc.write) + return -EIO; + return do_tty_write(tty->ldisc.write, tty, file, + (const unsigned char *)buf, count); +} + +ssize_t redirected_tty_write(struct file * file, const char * buf, size_t count, + loff_t *ppos) +{ + struct file *p = NULL; + + spin_lock(&redirect_lock); + if (redirect) { + get_file(redirect); + p = redirect; + } + spin_unlock(&redirect_lock); + + if (p) { + ssize_t res; + /* Can't seek (pwrite) on ttys. */ + if (ppos != &file->f_pos) + return -ESPIPE; + res = vfs_write(p, buf, count, &p->f_pos); + fput(p); + return res; + } + + return tty_write(file, buf, count, ppos); +} + +/* Semaphore to protect creating and releasing a tty */ +static DECLARE_MUTEX(tty_sem); + +static void down_tty_sem(int index) +{ + down(&tty_sem); +} + +static void up_tty_sem(int index) +{ + up(&tty_sem); +} + +static void release_mem(struct tty_struct *tty, int idx); + +static inline void tty_line_name(struct tty_driver *driver, int index, char *p) +{ + sprintf(p, "%s%d", driver->name, index + driver->name_base); +} + +/* + * WSH 06/09/97: Rewritten to remove races and properly clean up after a + * failed open. The new code protects the open with a semaphore, so it's + * really quite straightforward. The semaphore locking can probably be + * relaxed for the (most common) case of reopening a tty. 
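/*
 * For illustration: the same "split big writes into bounded chunks"
 * pattern as do_tty_write() above, in userspace form.  Each pass is
 * capped (the kernel uses max(2*PAGE_SIZE, 16384UL)), partial results
 * accumulate like 'written' above, and an interruption ends the loop
 * early without losing what was already written.  The 16K figure here
 * just mirrors the cap above.
 */
#include <errno.h>
#include <unistd.h>

#define WRITE_CHUNK (16 * 1024)

ssize_t write_in_chunks(int fd, const char *buf, size_t count)
{
	ssize_t written = 0;

	while (count > 0) {
		size_t n = count > WRITE_CHUNK ? WRITE_CHUNK : count;
		ssize_t ret = write(fd, buf, n);

		if (ret < 0) {
			/* like the signal_pending() check above: report
			 * progress if we made any, else the error */
			if (errno == EINTR && written)
				return written;
			return -1;
		}
		written += ret;
		buf += ret;
		count -= ret;
	}
	return written;
}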
+ */ +static int init_dev(struct tty_driver *driver, int idx, + struct tty_struct **ret_tty) +{ + struct tty_struct *tty, *o_tty; + struct termios *tp, **tp_loc, *o_tp, **o_tp_loc; + struct termios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; + int retval=0; + + /* + * Check whether we need to acquire the tty semaphore to avoid + * race conditions. For now, play it safe. + */ + down_tty_sem(idx); + + /* check whether we're reopening an existing tty */ + tty = driver->ttys[idx]; + if (tty) goto fast_track; + + /* + * First time open is complex, especially for PTY devices. + * This code guarantees that either everything succeeds and the + * TTY is ready for operation, or else the table slots are vacated + * and the allocated memory released. (Except that the termios + * and locked termios may be retained.) + */ + + if (!try_module_get(driver->owner)) { + retval = -ENODEV; + goto end_init; + } + + o_tty = NULL; + tp = o_tp = NULL; + ltp = o_ltp = NULL; + + tty = alloc_tty_struct(); + if(!tty) + goto fail_no_mem; + initialize_tty_struct(tty); + tty->driver = driver; + tty->index = idx; + tty_line_name(driver, idx, tty->name); + + tp_loc = &driver->termios[idx]; + if (!*tp_loc) { + tp = (struct termios *) kmalloc(sizeof(struct termios), + GFP_KERNEL); + if (!tp) + goto free_mem_out; + *tp = driver->init_termios; + } + + ltp_loc = &driver->termios_locked[idx]; + if (!*ltp_loc) { + ltp = (struct termios *) kmalloc(sizeof(struct termios), + GFP_KERNEL); + if (!ltp) + goto free_mem_out; + memset(ltp, 0, sizeof(struct termios)); + } + + if (driver->type == TTY_DRIVER_TYPE_PTY) { + o_tty = alloc_tty_struct(); + if (!o_tty) + goto free_mem_out; + initialize_tty_struct(o_tty); + o_tty->driver = driver->other; + o_tty->index = idx; + tty_line_name(driver->other, idx, o_tty->name); + + o_tp_loc = &driver->other->termios[idx]; + if (!*o_tp_loc) { + o_tp = (struct termios *) + kmalloc(sizeof(struct termios), GFP_KERNEL); + if (!o_tp) + goto free_mem_out; + *o_tp = driver->other->init_termios; + } + + o_ltp_loc = &driver->other->termios_locked[idx]; + if (!*o_ltp_loc) { + o_ltp = (struct termios *) + kmalloc(sizeof(struct termios), GFP_KERNEL); + if (!o_ltp) + goto free_mem_out; + memset(o_ltp, 0, sizeof(struct termios)); + } + + /* + * Everything allocated ... set up the o_tty structure. + */ + driver->other->ttys[idx] = o_tty; + if (!*o_tp_loc) + *o_tp_loc = o_tp; + if (!*o_ltp_loc) + *o_ltp_loc = o_ltp; + o_tty->termios = *o_tp_loc; + o_tty->termios_locked = *o_ltp_loc; + driver->other->refcount++; + if (driver->subtype == PTY_TYPE_MASTER) + o_tty->count++; + + /* Establish the links in both directions */ + tty->link = o_tty; + o_tty->link = tty; + } + + /* + * All structures have been allocated, so now we install them. + * Failures after this point use release_mem to clean up, so + * there's no need to null out the local pointers. + */ + driver->ttys[idx] = tty; + + if (!*tp_loc) + *tp_loc = tp; + if (!*ltp_loc) + *ltp_loc = ltp; + tty->termios = *tp_loc; + tty->termios_locked = *ltp_loc; + driver->refcount++; + tty->count++; + + /* + * Structures all installed ... call the ldisc open routines. + * If we fail here just call release_mem to clean up. No need + * to decrement the use counts, as release_mem doesn't care. 
+ */ + if (tty->ldisc.open) { + retval = (tty->ldisc.open)(tty); + if (retval) + goto release_mem_out; + } + if (o_tty && o_tty->ldisc.open) { + retval = (o_tty->ldisc.open)(o_tty); + if (retval) { + if (tty->ldisc.close) + (tty->ldisc.close)(tty); + goto release_mem_out; + } + } + goto success; + + /* + * This fast open can be used if the tty is already open. + * No memory is allocated, and the only failures are from + * attempting to open a closing tty or attempting multiple + * opens on a pty master. + */ +fast_track: + if (test_bit(TTY_CLOSING, &tty->flags)) { + retval = -EIO; + goto end_init; + } + if (driver->type == TTY_DRIVER_TYPE_PTY && + driver->subtype == PTY_TYPE_MASTER) { + /* + * special case for PTY masters: only one open permitted, + * and the slave side open count is incremented as well. + */ + if (tty->count) { + retval = -EIO; + goto end_init; + } + tty->link->count++; + } + tty->count++; + tty->driver = driver; /* N.B. why do this every time?? */ + +success: + *ret_tty = tty; + + /* All paths come through here to release the semaphore */ +end_init: + up_tty_sem(idx); + return retval; + + /* Release locally allocated memory ... nothing placed in slots */ +free_mem_out: + if (o_tp) + kfree(o_tp); + if (o_tty) + free_tty_struct(o_tty); + if (ltp) + kfree(ltp); + if (tp) + kfree(tp); + free_tty_struct(tty); + +fail_no_mem: + module_put(driver->owner); + retval = -ENOMEM; + goto end_init; + + /* call the tty release_mem routine to clean out this slot */ +release_mem_out: + printk(KERN_INFO "init_dev: ldisc open failed, " + "clearing slot %d\n", idx); + release_mem(tty, idx); + goto end_init; +} + +/* + * Releases memory associated with a tty structure, and clears out the + * driver table slots. + */ +static void release_mem(struct tty_struct *tty, int idx) +{ + struct tty_struct *o_tty; + struct termios *tp; + + if ((o_tty = tty->link) != NULL) { + o_tty->driver->ttys[idx] = NULL; + if (o_tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) { + tp = o_tty->driver->termios[idx]; + o_tty->driver->termios[idx] = NULL; + kfree(tp); + } + o_tty->magic = 0; + o_tty->driver->refcount--; + file_list_lock(); + list_del_init(&o_tty->tty_files); + file_list_unlock(); + free_tty_struct(o_tty); + } + + tty->driver->ttys[idx] = NULL; + if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) { + tp = tty->driver->termios[idx]; + tty->driver->termios[idx] = NULL; + kfree(tp); + } + tty->magic = 0; + tty->driver->refcount--; + file_list_lock(); + list_del_init(&tty->tty_files); + file_list_unlock(); + module_put(tty->driver->owner); + free_tty_struct(tty); +} + +/* + * Even releasing the tty structures is a tricky business.. We have + * to be very careful that the structures are all released at the + * same time, as interrupts might otherwise get the wrong pointers. + * + * WSH 09/09/97: rewritten to avoid some nasty race conditions that could + * lead to double frees or releasing memory still in use. 
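/*
 * For illustration: the allocate-everything-then-install idiom that
 * init_dev() uses above, reduced to its shape.  Every failure path
 * jumps to a label that frees exactly what was set up before it, so
 * the call either fully succeeds or leaves nothing behind.  The names
 * here are invented for the sketch.
 */
#include <stdlib.h>
#include <string.h>

struct pair {
	char *a;
	char *b;
};

struct pair *pair_create(void)
{
	struct pair *p = malloc(sizeof(*p));

	if (!p)
		return NULL;
	memset(p, 0, sizeof(*p));

	p->a = malloc(64);
	if (!p->a)
		goto free_pair;
	p->b = malloc(64);
	if (!p->b)
		goto free_a;
	return p;		/* fully installed, caller owns it all */

free_a:
	free(p->a);
free_pair:
	free(p);
	return NULL;
}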
+ */ +static void release_dev(struct file * filp) +{ + struct tty_struct *tty, *o_tty; + int pty_master, tty_closing, o_tty_closing, do_sleep; + int idx; + char buf[64]; + + tty = (struct tty_struct *)filp->private_data; + if (tty_paranoia_check(tty, filp->f_dentry->d_inode, "release_dev")) + return; + + check_tty_count(tty, "release_dev"); + + tty_fasync(-1, filp, 0); + + idx = tty->index; + pty_master = (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_MASTER); + o_tty = tty->link; + +#ifdef TTY_PARANOIA_CHECK + if (idx < 0 || idx >= tty->driver->num) { + printk(KERN_DEBUG "release_dev: bad idx when trying to " + "free (%s)\n", tty->name); + return; + } + if (tty != tty->driver->ttys[idx]) { + printk(KERN_DEBUG "release_dev: driver.table[%d] not tty " + "for (%s)\n", idx, tty->name); + return; + } + if (tty->termios != tty->driver->termios[idx]) { + printk(KERN_DEBUG "release_dev: driver.termios[%d] not termios " + "for (%s)\n", + idx, tty->name); + return; + } + if (tty->termios_locked != tty->driver->termios_locked[idx]) { + printk(KERN_DEBUG "release_dev: driver.termios_locked[%d] not " + "termios_locked for (%s)\n", + idx, tty->name); + return; + } +#endif + +#ifdef TTY_DEBUG_HANGUP + printk(KERN_DEBUG "release_dev of %s (tty count=%d)...", + tty_name(tty, buf), tty->count); +#endif + +#ifdef TTY_PARANOIA_CHECK + if (tty->driver->other) { + if (o_tty != tty->driver->other->ttys[idx]) { + printk(KERN_DEBUG "release_dev: other->table[%d] " + "not o_tty for (%s)\n", + idx, tty->name); + return; + } + if (o_tty->termios != tty->driver->other->termios[idx]) { + printk(KERN_DEBUG "release_dev: other->termios[%d] " + "not o_termios for (%s)\n", + idx, tty->name); + return; + } + if (o_tty->termios_locked != + tty->driver->other->termios_locked[idx]) { + printk(KERN_DEBUG "release_dev: other->termios_locked[" + "%d] not o_termios_locked for (%s)\n", + idx, tty->name); + return; + } + if (o_tty->link != tty) { + printk(KERN_DEBUG "release_dev: bad pty pointers\n"); + return; + } + } +#endif + + if (tty->driver->close) + tty->driver->close(tty, filp); + + /* + * Sanity check: if tty->count is going to zero, there shouldn't be + * any waiters on tty->read_wait or tty->write_wait. We test the + * wait queues and kick everyone out _before_ actually starting to + * close. This ensures that we won't block while releasing the tty + * structure. + * + * The test for the o_tty closing is necessary, since the master and + * slave sides may close in any order. If the slave side closes out + * first, its count will be one, since the master side holds an open. + * Thus this test wouldn't be triggered at the time the slave closes, + * so we do it now. + * + * Note that it's possible for the tty to be opened again while we're + * flushing out waiters. By recalculating the closing flags before + * each iteration we avoid any problems. + */ + while (1) { + tty_closing = tty->count <= 1; + o_tty_closing = o_tty && + (o_tty->count <= (pty_master ? 
1 : 0)); + do_sleep = 0; + + if (tty_closing) { + if (waitqueue_active(&tty->read_wait)) { + wake_up(&tty->read_wait); + do_sleep++; + } + if (waitqueue_active(&tty->write_wait)) { + wake_up(&tty->write_wait); + do_sleep++; + } + } + if (o_tty_closing) { + if (waitqueue_active(&o_tty->read_wait)) { + wake_up(&o_tty->read_wait); + do_sleep++; + } + if (waitqueue_active(&o_tty->write_wait)) { + wake_up(&o_tty->write_wait); + do_sleep++; + } + } + if (!do_sleep) + break; + + printk(KERN_WARNING "release_dev: %s: read/write wait queue " + "active!\n", tty_name(tty, buf)); + schedule(); + } + + /* + * The closing flags are now consistent with the open counts on + * both sides, and we've completed the last operation that could + * block, so it's safe to proceed with closing. + */ + if (pty_master) { + if (--o_tty->count < 0) { + printk(KERN_WARNING "release_dev: bad pty slave count " + "(%d) for %s\n", + o_tty->count, tty_name(o_tty, buf)); + o_tty->count = 0; + } + } + if (--tty->count < 0) { + printk(KERN_WARNING "release_dev: bad tty->count (%d) for %s\n", + tty->count, tty_name(tty, buf)); + tty->count = 0; + } + + /* + * We've decremented tty->count, so we need to remove this file + * descriptor off the tty->tty_files list; this serves two + * purposes: + * - check_tty_count sees the correct number of file descriptors + * associated with this tty. + * - do_tty_hangup no longer sees this file descriptor as + * something that needs to be handled for hangups. + */ + file_kill(filp); + filp->private_data = NULL; + + /* + * Perform some housekeeping before deciding whether to return. + * + * Set the TTY_CLOSING flag if this was the last open. In the + * case of a pty we may have to wait around for the other side + * to close, and TTY_CLOSING makes sure we can't be reopened. + */ + if(tty_closing) + set_bit(TTY_CLOSING, &tty->flags); + if(o_tty_closing) + set_bit(TTY_CLOSING, &o_tty->flags); + + /* + * If _either_ side is closing, make sure there aren't any + * processes that still think tty or o_tty is their controlling + * tty. + */ + if (tty_closing || o_tty_closing) { + struct task_struct *p; + struct list_head *l; + struct pid *pid; + + read_lock(&tasklist_lock); + for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + p->tty = NULL; + if (o_tty) + for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid) + p->tty = NULL; + read_unlock(&tasklist_lock); + } + + /* check whether both sides are closing ... */ + if (!tty_closing || (o_tty && !o_tty_closing)) + return; + +#ifdef TTY_DEBUG_HANGUP + printk(KERN_DEBUG "freeing tty structure..."); +#endif + + /* + * Shutdown the current line discipline, and reset it to N_TTY. + * N.B. why reset ldisc when we're releasing the memory?? + */ + if (tty->ldisc.close) + (tty->ldisc.close)(tty); + module_put(tty->ldisc.owner); + + tty->ldisc = ldiscs[N_TTY]; + tty->termios->c_line = N_TTY; + if (o_tty) { + if (o_tty->ldisc.close) + (o_tty->ldisc.close)(o_tty); + module_put(o_tty->ldisc.owner); + o_tty->ldisc = ldiscs[N_TTY]; + } + + /* + * Prevent flush_to_ldisc() from rescheduling the work for later. Then + * kill any delayed work. + */ + clear_bit(TTY_DONT_FLIP, &tty->flags); + cancel_delayed_work(&tty->flip.work); + + /* + * Wait for ->hangup_work and ->flip.work handlers to terminate + */ + flush_scheduled_work(); + + /* + * The release_mem function takes care of the details of clearing + * the slots and preserving the termios structure. 
+ */ + release_mem(tty, idx); +} + +/* + * tty_open and tty_release keep up the tty count that contains the + * number of opens done on a tty. We cannot use the inode-count, as + * different inodes might point to the same tty. + * + * Open-counting is needed for pty masters, as well as for keeping + * track of serial lines: DTR is dropped when the last close happens. + * (This is not done solely through tty->count, now. - Ted 1/27/92) + * + * The termios state of a pty is reset on first open so that + * settings don't persist across reuse. + */ +static int tty_open(struct inode * inode, struct file * filp) +{ + struct tty_struct *tty; + int noctty, retval; + struct tty_driver *driver; + int index; + dev_t device = inode->i_rdev; + unsigned short saved_flags = filp->f_flags; +retry_open: + noctty = filp->f_flags & O_NOCTTY; + if (device == MKDEV(TTYAUX_MAJOR,0)) { + if (!current->tty) + return -ENXIO; + driver = current->tty->driver; + index = current->tty->index; + filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */ + /* noctty = 1; */ + goto got_driver; + } +#ifdef CONFIG_VT + if (device == MKDEV(TTY_MAJOR,0)) { + extern int fg_console; + extern struct tty_driver *console_driver; + driver = console_driver; + index = fg_console; + noctty = 1; + goto got_driver; + } +#endif + if (device == MKDEV(TTYAUX_MAJOR,1)) { + struct console *c = console_drivers; + for (c = console_drivers; c; c = c->next) { + if (!c->device) + continue; + driver = c->device(c, &index); + if (!driver) + continue; + /* Don't let /dev/console block */ + filp->f_flags |= O_NONBLOCK; + noctty = 1; + goto got_driver; + } + return -ENODEV; + } + + if (device == MKDEV(TTYAUX_MAJOR,2)) { +#ifdef CONFIG_UNIX98_PTYS + /* find a device that is not in use. */ + retval = -1; + driver = ptm_driver; + for (index = 0; index < driver->num ; index++) + if (!init_dev(driver, index, &tty)) + goto ptmx_found; /* ok! */ + return -EIO; /* no free ptys */ + ptmx_found: + set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ + devpts_pty_new(index, MKDEV(pts_driver->major, pts_driver->minor_start) + index); + noctty = 1; +#else + return -ENODEV; +#endif /* CONFIG_UNIX98_PTYS */ + } else { + driver = get_tty_driver(device, &index); + if (!driver) + return -ENODEV; +got_driver: + retval = init_dev(driver, index, &tty); + if (retval) + return retval; + } + + filp->private_data = tty; + file_move(filp, &tty->tty_files); + check_tty_count(tty, "tty_open"); + if (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_MASTER) + noctty = 1; +#ifdef TTY_DEBUG_HANGUP + printk(KERN_DEBUG "opening %s...", tty->name); +#endif + if (tty->driver->open) + retval = tty->driver->open(tty, filp); + else + retval = -ENODEV; + filp->f_flags = saved_flags; + + if (!retval && test_bit(TTY_EXCLUSIVE, &tty->flags) && !capable(CAP_SYS_ADMIN)) + retval = -EBUSY; + + if (retval) { +#ifdef TTY_DEBUG_HANGUP + printk(KERN_DEBUG "error %d in opening %s...", retval, + tty->name); +#endif + + release_dev(filp); + if (retval != -ERESTARTSYS) + return retval; + if (signal_pending(current)) + return retval; + schedule(); + /* + * Need to reset f_op in case a hangup happened.
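/*
 * For illustration: the userspace half of the /dev/ptmx branch in
 * tty_open() above.  Opening the master picks a free pty pair and the
 * kernel sets TTY_PTY_LOCK on it; grantpt()/unlockpt() then fix the
 * slave's ownership and clear the lock before the slave may be opened.
 * Sketch only, assuming glibc's Unix98 pty helpers.
 */
#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdlib.h>

int open_pty_pair(int *master, int *slave)
{
	char *name;

	*master = open("/dev/ptmx", O_RDWR | O_NOCTTY);
	if (*master < 0)
		return -1;
	if (grantpt(*master) < 0 || unlockpt(*master) < 0)
		return -1;
	name = ptsname(*master);	/* a devpts node, e.g. /dev/pts/0 */
	if (!name)
		return -1;
	*slave = open(name, O_RDWR | O_NOCTTY);
	return *slave < 0 ? -1 : 0;
}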
+ */ + if (filp->f_op == &hung_up_tty_fops) + filp->f_op = &tty_fops; + goto retry_open; + } + if (!noctty && + current->leader && + !current->tty && + tty->session == 0) { + task_lock(current); + current->tty = tty; + task_unlock(current); + current->tty_old_pgrp = 0; + tty->session = current->session; + tty->pgrp = process_group(current); + } + return 0; +} + +static int tty_release(struct inode * inode, struct file * filp) +{ + lock_kernel(); + release_dev(filp); + unlock_kernel(); + return 0; +} + +/* No kernel lock held - fine */ +static unsigned int tty_poll(struct file * filp, poll_table * wait) +{ + struct tty_struct * tty; + + tty = (struct tty_struct *)filp->private_data; + if (tty_paranoia_check(tty, filp->f_dentry->d_inode, "tty_poll")) + return 0; + + if (tty->ldisc.poll) + return (tty->ldisc.poll)(tty, filp, wait); + return 0; +} + +static int tty_fasync(int fd, struct file * filp, int on) +{ + struct tty_struct * tty; + int retval; + + tty = (struct tty_struct *)filp->private_data; + if (tty_paranoia_check(tty, filp->f_dentry->d_inode, "tty_fasync")) + return 0; + + retval = fasync_helper(fd, filp, on, &tty->fasync); + if (retval <= 0) + return retval; + + if (on) { + if (!waitqueue_active(&tty->read_wait)) + tty->minimum_to_wake = 1; + retval = f_setown(filp, (-tty->pgrp) ? : current->pid, 0); + if (retval) + return retval; + } else { + if (!tty->fasync && !waitqueue_active(&tty->read_wait)) + tty->minimum_to_wake = N_TTY_BUF_SIZE; + } + return 0; +} + +static int tiocsti(struct tty_struct *tty, char * arg) +{ + char ch, mbz = 0; + + if ((current->tty != tty) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(ch, arg)) + return -EFAULT; + tty->ldisc.receive_buf(tty, &ch, &mbz, 1); + return 0; +} + +static int tiocgwinsz(struct tty_struct *tty, struct winsize * arg) +{ + if (copy_to_user(arg, &tty->winsize, sizeof(*arg))) + return -EFAULT; + return 0; +} + +static int tiocswinsz(struct tty_struct *tty, struct tty_struct *real_tty, + struct winsize * arg) +{ + struct winsize tmp_ws; + + if (copy_from_user(&tmp_ws, arg, sizeof(*arg))) + return -EFAULT; + if (!memcmp(&tmp_ws, &tty->winsize, sizeof(*arg))) + return 0; +#ifdef CONFIG_VT + if (tty->driver->type == TTY_DRIVER_TYPE_CONSOLE) { + unsigned int currcons = tty->index; + if (vc_resize(currcons, tmp_ws.ws_col, tmp_ws.ws_row)) + return -ENXIO; + } +#endif + if (tty->pgrp > 0) + kill_pg(tty->pgrp, SIGWINCH, 1); + if ((real_tty->pgrp != tty->pgrp) && (real_tty->pgrp > 0)) + kill_pg(real_tty->pgrp, SIGWINCH, 1); + tty->winsize = tmp_ws; + real_tty->winsize = tmp_ws; + return 0; +} + +static int tioccons(struct file *file) +{ + if (file->f_op->write == redirected_tty_write) { + struct file *f; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + spin_lock(&redirect_lock); + f = redirect; + redirect = NULL; + spin_unlock(&redirect_lock); + if (f) + fput(f); + return 0; + } + spin_lock(&redirect_lock); + if (redirect) { + spin_unlock(&redirect_lock); + return -EBUSY; + } + get_file(file); + redirect = file; + spin_unlock(&redirect_lock); + return 0; +} + + +static int fionbio(struct file *file, int *arg) +{ + int nonblock; + + if (get_user(nonblock, arg)) + return -EFAULT; + + if (nonblock) + file->f_flags |= O_NONBLOCK; + else + file->f_flags &= ~O_NONBLOCK; + return 0; +} + +static int tiocsctty(struct tty_struct *tty, int arg) +{ + struct list_head *l; + struct pid *pid; + task_t *p; + + if (current->leader && + (current->session == tty->session)) + return 0; + /* + * The process must be a session leader and + * not have 
a controlling tty already. + */ + if (!current->leader || current->tty) + return -EPERM; + if (tty->session > 0) { + /* + * This tty is already the controlling + * tty for another session group! + */ + if ((arg == 1) && capable(CAP_SYS_ADMIN)) { + /* + * Steal it away + */ + + read_lock(&tasklist_lock); + for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + p->tty = NULL; + read_unlock(&tasklist_lock); + } else + return -EPERM; + } + task_lock(current); + current->tty = tty; + task_unlock(current); + current->tty_old_pgrp = 0; + tty->session = current->session; + tty->pgrp = process_group(current); + return 0; +} + +static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *arg) +{ + /* + * (tty == real_tty) is a cheap way of + * testing if the tty is NOT a master pty. + */ + if (tty == real_tty && current->tty != real_tty) + return -ENOTTY; + return put_user(real_tty->pgrp, arg); +} + +static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *arg) +{ + pid_t pgrp; + int retval = tty_check_change(real_tty); + + if (retval == -EIO) + return -ENOTTY; + if (retval) + return retval; + if (!current->tty || + (current->tty != real_tty) || + (real_tty->session != current->session)) + return -ENOTTY; + if (get_user(pgrp, (pid_t *) arg)) + return -EFAULT; + if (pgrp < 0) + return -EINVAL; + if (session_of_pgrp(pgrp) != current->session) + return -EPERM; + real_tty->pgrp = pgrp; + return 0; +} + +static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *arg) +{ + /* + * (tty == real_tty) is a cheap way of + * testing if the tty is NOT a master pty. + */ + if (tty == real_tty && current->tty != real_tty) + return -ENOTTY; + if (real_tty->session <= 0) + return -ENOTTY; + return put_user(real_tty->session, arg); +} + +static int tiocsetd(struct tty_struct *tty, int *arg) +{ + int ldisc; + + if (get_user(ldisc, arg)) + return -EFAULT; + return tty_set_ldisc(tty, ldisc); +} + +static int send_break(struct tty_struct *tty, int duration) +{ + set_current_state(TASK_INTERRUPTIBLE); + + tty->driver->break_ctl(tty, -1); + if (!signal_pending(current)) + schedule_timeout(duration); + tty->driver->break_ctl(tty, 0); + if (signal_pending(current)) + return -EINTR; + return 0; +} + +static int +tty_tiocmget(struct tty_struct *tty, struct file *file, unsigned long arg) +{ + int retval = -EINVAL; + + if (tty->driver->tiocmget) { + retval = tty->driver->tiocmget(tty, file); + + if (retval >= 0) + retval = put_user(retval, (int *)arg); + } + return retval; +} + +static int +tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int cmd, + unsigned long arg) +{ + int retval = -EINVAL; + + if (tty->driver->tiocmset) { + unsigned int set, clear, val; + + retval = get_user(val, (unsigned int *)arg); + if (retval) + return retval; + + set = clear = 0; + switch (cmd) { + case TIOCMBIS: + set = val; + break; + case TIOCMBIC: + clear = val; + break; + case TIOCMSET: + set = val; + clear = ~val; + break; + } + + set &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; + clear &= TIOCM_DTR|TIOCM_RTS|TIOCM_OUT1|TIOCM_OUT2|TIOCM_LOOP; + + retval = tty->driver->tiocmset(tty, file, set, clear); + } + return retval; +} + +/* + * Split this up, as gcc can choke on it otherwise.. 
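/*
 * For illustration: driving the tty_tiocmget()/tty_tiocmset() paths
 * above from userspace.  TIOCMBIS and TIOCMBIC become the 'set' and
 * 'clear' masks in tty_tiocmset(); TIOCMGET reads the lines back.
 * Sketch only; fd is assumed to be an open serial port.
 */
#include <sys/ioctl.h>
#include <termios.h>
#include <unistd.h>

int pulse_dtr(int fd, int *state)
{
	int dtr = TIOCM_DTR;

	if (ioctl(fd, TIOCMBIC, &dtr) < 0)	/* clear DTR */
		return -1;
	usleep(100 * 1000);			/* hold it low briefly */
	if (ioctl(fd, TIOCMBIS, &dtr) < 0)	/* raise DTR again */
		return -1;
	return ioctl(fd, TIOCMGET, state);	/* read all lines back */
}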
+ */ +int tty_ioctl(struct inode * inode, struct file * file, + unsigned int cmd, unsigned long arg) +{ + struct tty_struct *tty, *real_tty; + int retval; + + tty = (struct tty_struct *)file->private_data; + if (tty_paranoia_check(tty, inode, "tty_ioctl")) + return -EINVAL; + + real_tty = tty; + if (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_MASTER) + real_tty = tty->link; + + /* + * Break handling by driver + */ + if (!tty->driver->break_ctl) { + switch(cmd) { + case TIOCSBRK: + case TIOCCBRK: + if (tty->driver->ioctl) + return tty->driver->ioctl(tty, file, cmd, arg); + return -EINVAL; + + /* These two ioctls always return success, even if */ + /* the driver doesn't support them. */ + case TCSBRK: + case TCSBRKP: + if (!tty->driver->ioctl) + return 0; + retval = tty->driver->ioctl(tty, file, cmd, arg); + if (retval == -ENOIOCTLCMD) + retval = 0; + return retval; + } + } + + /* + * Factor out some common prep work + */ + switch (cmd) { + case TIOCSETD: + case TIOCSBRK: + case TIOCCBRK: + case TCSBRK: + case TCSBRKP: + retval = tty_check_change(tty); + if (retval) + return retval; + if (cmd != TIOCCBRK) { + tty_wait_until_sent(tty, 0); + if (signal_pending(current)) + return -EINTR; + } + break; + } + + switch (cmd) { + case TIOCSTI: + return tiocsti(tty, (char *)arg); + case TIOCGWINSZ: + return tiocgwinsz(tty, (struct winsize *) arg); + case TIOCSWINSZ: + return tiocswinsz(tty, real_tty, (struct winsize *) arg); + case TIOCCONS: + return real_tty!=tty ? -EINVAL : tioccons(file); + case FIONBIO: + return fionbio(file, (int *) arg); + case TIOCEXCL: + set_bit(TTY_EXCLUSIVE, &tty->flags); + return 0; + case TIOCNXCL: + clear_bit(TTY_EXCLUSIVE, &tty->flags); + return 0; + case TIOCNOTTY: + if (current->tty != tty) + return -ENOTTY; + if (current->leader) + disassociate_ctty(0); + task_lock(current); + current->tty = NULL; + task_unlock(current); + return 0; + case TIOCSCTTY: + return tiocsctty(tty, arg); + case TIOCGPGRP: + return tiocgpgrp(tty, real_tty, (pid_t *) arg); + case TIOCSPGRP: + return tiocspgrp(tty, real_tty, (pid_t *) arg); + case TIOCGSID: + return tiocgsid(tty, real_tty, (pid_t *) arg); + case TIOCGETD: + return put_user(tty->ldisc.num, (int *) arg); + case TIOCSETD: + return tiocsetd(tty, (int *) arg); +#ifdef CONFIG_VT + case TIOCLINUX: + return tioclinux(tty, arg); +#endif + /* + * Break handling + */ + case TIOCSBRK: /* Turn break on, unconditionally */ + tty->driver->break_ctl(tty, -1); + return 0; + + case TIOCCBRK: /* Turn break off, unconditionally */ + tty->driver->break_ctl(tty, 0); + return 0; + case TCSBRK: /* SVID version: non-zero arg --> no break */ + /* + * XXX is the above comment correct, or the + * code below correct? Is this ioctl used at + * all by anyone? + */ + if (!arg) + return send_break(tty, HZ/4); + return 0; + case TCSBRKP: /* support for POSIX tcsendbreak() */ + return send_break(tty, arg ?
arg*(HZ/10) : HZ/4); + + case TIOCMGET: + return tty_tiocmget(tty, file, arg); + + case TIOCMSET: + case TIOCMBIC: + case TIOCMBIS: + return tty_tiocmset(tty, file, cmd, arg); + } + if (tty->driver->ioctl) { + int retval = (tty->driver->ioctl)(tty, file, cmd, arg); + if (retval != -ENOIOCTLCMD) + return retval; + } + if (tty->ldisc.ioctl) { + int retval = (tty->ldisc.ioctl)(tty, file, cmd, arg); + if (retval != -ENOIOCTLCMD) + return retval; + } + return -EINVAL; +} + + +/* + * This implements the "Secure Attention Key" --- the idea is to + * prevent trojan horses by killing all processes associated with this + * tty when the user hits the "Secure Attention Key". Required for + * super-paranoid applications --- see the Orange Book for more details. + * + * This code could be nicer; ideally it should send a HUP, wait a few + * seconds, then send a INT, and then a KILL signal. But you then + * have to coordinate with the init process, since all processes associated + * with the current tty must be dead before the new getty is allowed + * to spawn. + * + * Now, if it would be correct ;-/ The current code has a nasty hole - + * it doesn't catch files in flight. We may send the descriptor to ourselves + * via AF_UNIX socket, close it and later fetch from socket. FIXME. + * + * Nasty bug: do_SAK is being called in interrupt context. This can + * deadlock. We punt it up to process context. AKPM - 16Mar2001 + */ +static void __do_SAK(void *arg) +{ +#ifdef TTY_SOFT_SAK + tty_hangup(tty); +#else + struct tty_struct *tty = arg; + struct task_struct *p; + struct list_head *l; + struct pid *pid; + int session; + int i; + struct file *filp; + + if (!tty) + return; + session = tty->session; + if (tty->ldisc.flush_buffer) + tty->ldisc.flush_buffer(tty); + if (tty->driver->flush_buffer) + tty->driver->flush_buffer(tty); + read_lock(&tasklist_lock); + for_each_task_pid(session, PIDTYPE_SID, p, l, pid) { + if (p->tty == tty || session > 0) { + printk(KERN_NOTICE "SAK: killed process %d" + " (%s): p->session==tty->session\n", + p->pid, p->comm); + send_sig(SIGKILL, p, 1); + continue; + } + task_lock(p); + if (p->files) { + spin_lock(&p->files->file_lock); + for (i=0; i < p->files->max_fds; i++) { + filp = fcheck_files(p->files, i); + if (!filp) + continue; + if (filp->f_op->read == tty_read && + filp->private_data == tty) { + printk(KERN_NOTICE "SAK: killed process %d" + " (%s): fd#%d opened to the tty\n", + p->pid, p->comm, i); + send_sig(SIGKILL, p, 1); + break; + } + } + spin_unlock(&p->files->file_lock); + } + task_unlock(p); + } + read_unlock(&tasklist_lock); +#endif +} + +/* + * The tq handling here is a little racy - tty->SAK_work may already be queued. + * Fortunately we don't need to worry, because if ->SAK_work is already queued, + * the values which we write to it will be identical to the values which it + * already has. --akpm + */ +void do_SAK(struct tty_struct *tty) +{ + if (!tty) + return; + PREPARE_WORK(&tty->SAK_work, __do_SAK, tty); + schedule_work(&tty->SAK_work); +} + +EXPORT_SYMBOL(do_SAK); + +/* + * This routine is called out of the software interrupt to flush data + * from the flip buffer to the line discipline. 
+ */ +static void flush_to_ldisc(void *private_) +{ + struct tty_struct *tty = (struct tty_struct *) private_; + unsigned char *cp; + char *fp; + int count; + unsigned long flags; + + if (test_bit(TTY_DONT_FLIP, &tty->flags)) { + /* + * Do it after the next timer tick: + */ + schedule_delayed_work(&tty->flip.work, 1); + return; + } + + spin_lock_irqsave(&tty->read_lock, flags); + if (tty->flip.buf_num) { + cp = tty->flip.char_buf + TTY_FLIPBUF_SIZE; + fp = tty->flip.flag_buf + TTY_FLIPBUF_SIZE; + tty->flip.buf_num = 0; + tty->flip.char_buf_ptr = tty->flip.char_buf; + tty->flip.flag_buf_ptr = tty->flip.flag_buf; + } else { + cp = tty->flip.char_buf; + fp = tty->flip.flag_buf; + tty->flip.buf_num = 1; + tty->flip.char_buf_ptr = tty->flip.char_buf + TTY_FLIPBUF_SIZE; + tty->flip.flag_buf_ptr = tty->flip.flag_buf + TTY_FLIPBUF_SIZE; + } + count = tty->flip.count; + tty->flip.count = 0; + spin_unlock_irqrestore(&tty->read_lock, flags); + + tty->ldisc.receive_buf(tty, cp, fp, count); +} + +/* + * Routine which returns the baud rate of the tty + * + * Note that the baud_table needs to be kept in sync with the + * include/asm/termbits.h file. + */ +static int baud_table[] = { + 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800, + 9600, 19200, 38400, 57600, 115200, 230400, 460800, +#ifdef __sparc__ + 76800, 153600, 307200, 614400, 921600 +#else + 500000, 576000, 921600, 1000000, 1152000, 1500000, 2000000, + 2500000, 3000000, 3500000, 4000000 +#endif +}; + +static int n_baud_table = ARRAY_SIZE(baud_table); + +int tty_termios_baud_rate(struct termios *termios) +{ + unsigned int cbaud = termios->c_cflag & CBAUD; + + if (cbaud & CBAUDEX) { + cbaud &= ~CBAUDEX; + + if (cbaud < 1 || cbaud + 15 > n_baud_table) + termios->c_cflag &= ~CBAUDEX; + else + cbaud += 15; + } + + return baud_table[cbaud]; +} + +EXPORT_SYMBOL(tty_termios_baud_rate); + +int tty_get_baud_rate(struct tty_struct *tty) +{ + int baud = tty_termios_baud_rate(tty->termios); + + if (baud == 38400 && tty->alt_speed) { + if (!tty->warned) { + printk(KERN_WARNING "Use of setserial/setrocket to " + "set SPD_* flags is deprecated\n"); + tty->warned = 1; + } + baud = tty->alt_speed; + } + + return baud; +} + +EXPORT_SYMBOL(tty_get_baud_rate); + +void tty_flip_buffer_push(struct tty_struct *tty) +{ + if (tty->low_latency) + flush_to_ldisc((void *) tty); + else + schedule_delayed_work(&tty->flip.work, 1); +} + +EXPORT_SYMBOL(tty_flip_buffer_push); + +/* + * This subroutine initializes a tty structure. + */ +static void initialize_tty_struct(struct tty_struct *tty) +{ + memset(tty, 0, sizeof(struct tty_struct)); + tty->magic = TTY_MAGIC; + tty->ldisc = ldiscs[N_TTY]; + tty->pgrp = -1; + tty->flip.char_buf_ptr = tty->flip.char_buf; + tty->flip.flag_buf_ptr = tty->flip.flag_buf; + INIT_WORK(&tty->flip.work, flush_to_ldisc, tty); + init_MUTEX(&tty->flip.pty_sem); + init_waitqueue_head(&tty->write_wait); + init_waitqueue_head(&tty->read_wait); + INIT_WORK(&tty->hangup_work, do_tty_hangup, tty); + sema_init(&tty->atomic_read, 1); + sema_init(&tty->atomic_write, 1); + spin_lock_init(&tty->read_lock); + INIT_LIST_HEAD(&tty->tty_files); + INIT_WORK(&tty->SAK_work, NULL, NULL); +} + +/* + * The default put_char routine if the driver did not define one. 
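/*
 * For illustration: the index arithmetic behind tty_termios_baud_rate()
 * above, as a worked example.  Plain Bxxx codes index baud_table
 * directly; CBAUDEX codes continue the table at entry n + 15, so with
 * the usual Linux encodings B9600 (octal 015) gives baud_table[13] =
 * 9600 and B57600 (CBAUDEX|1) gives baud_table[16] = 57600.  Sketch
 * assuming those encodings and a libc that exposes CBAUDEX.
 */
#include <termios.h>

static const int rate_table[] = {
	0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400,
	4800, 9600, 19200, 38400, 57600, 115200, 230400, 460800,
};

int speed_code_to_rate(speed_t code)
{
	unsigned int i = code & ~CBAUDEX;

	if (code & CBAUDEX)
		i += 15;	/* CBAUDEX entries start at index 16 */
	if (i >= sizeof(rate_table) / sizeof(rate_table[0]))
		return -1;
	return rate_table[i];
}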
+ */ +static void tty_default_put_char(struct tty_struct *tty, unsigned char ch) +{ + tty->driver->write(tty, 0, &ch, 1); +} + +struct tty_dev { + struct list_head node; + dev_t dev; + struct class_device class_dev; +}; +#define to_tty_dev(d) container_of(d, struct tty_dev, class_dev) + +static void release_tty_dev(struct class_device *class_dev) +{ + struct tty_dev *tty_dev = to_tty_dev(class_dev); + kfree(tty_dev); +} + +static struct class tty_class = { + .name = "tty", + .release = &release_tty_dev, +}; + +static LIST_HEAD(tty_dev_list); +static spinlock_t tty_dev_list_lock = SPIN_LOCK_UNLOCKED; + +static ssize_t show_dev(struct class_device *class_dev, char *buf) +{ + struct tty_dev *tty_dev = to_tty_dev(class_dev); + return print_dev_t(buf, tty_dev->dev); +} +static CLASS_DEVICE_ATTR(dev, S_IRUGO, show_dev, NULL); + +static void tty_add_class_device(char *name, dev_t dev, struct device *device) +{ + struct tty_dev *tty_dev = NULL; + int retval; + + tty_dev = kmalloc(sizeof(*tty_dev), GFP_KERNEL); + if (!tty_dev) + return; + memset(tty_dev, 0x00, sizeof(*tty_dev)); + + tty_dev->class_dev.dev = device; + tty_dev->class_dev.class = &tty_class; + snprintf(tty_dev->class_dev.class_id, BUS_ID_SIZE, "%s", name); + retval = class_device_register(&tty_dev->class_dev); + if (retval) + goto error; + class_device_create_file (&tty_dev->class_dev, &class_device_attr_dev); + tty_dev->dev = dev; + spin_lock(&tty_dev_list_lock); + list_add(&tty_dev->node, &tty_dev_list); + spin_unlock(&tty_dev_list_lock); + return; +error: + kfree(tty_dev); +} + +static void tty_remove_class_device(dev_t dev) +{ + struct tty_dev *tty_dev = NULL; + struct list_head *tmp; + int found = 0; + + spin_lock(&tty_dev_list_lock); + list_for_each (tmp, &tty_dev_list) { + tty_dev = list_entry(tmp, struct tty_dev, node); + if (tty_dev->dev == dev) { + list_del(&tty_dev->node); + found = 1; + break; + } + } + spin_unlock(&tty_dev_list_lock); + if (found) + class_device_unregister(&tty_dev->class_dev); +} + +/** + * tty_register_device - register a tty device + * @driver: the tty driver that describes the tty device + * @index: the index in the tty driver for this tty device + * @device: a struct device that is associated with this tty device. + * This field is optional; if there is no known struct device for this + * tty device it can be set to NULL safely. + * + * This call is required to be made to register an individual tty device if + * the tty driver's flags have the TTY_DRIVER_NO_DEVFS bit set. If that + * bit is not set, this function should not be called. + */ +void tty_register_device(struct tty_driver *driver, unsigned index, + struct device *device) +{ + dev_t dev = MKDEV(driver->major, driver->minor_start) + index; + + if (index >= driver->num) { + printk(KERN_ERR "Attempt to register invalid tty line number " + " (%d).\n", index); + return; + } + + devfs_mk_cdev(dev, S_IFCHR | S_IRUSR | S_IWUSR, + "%s%d", driver->devfs_name, index + driver->name_base); + + /* we don't care about the ptys */ + /* how nice to hide this behind some crappy interface.. */ + if (driver->type != TTY_DRIVER_TYPE_PTY) { + char name[64]; + tty_line_name(driver, index, name); + tty_add_class_device(name, dev, device); + } +} + +/** + * tty_unregister_device - unregister a tty device + * @driver: the tty driver that describes the tty device + * @index: the index in the tty driver for this tty device + * + * If a tty device is registered with a call to tty_register_device() then + * this function must be called when the tty device is gone.
+ */ +void tty_unregister_device(struct tty_driver *driver, unsigned index) +{ + devfs_remove("%s%d", driver->devfs_name, index + driver->name_base); + tty_remove_class_device(MKDEV(driver->major, driver->minor_start) + index); +} + +EXPORT_SYMBOL(tty_register_device); +EXPORT_SYMBOL(tty_unregister_device); + +static struct kobject tty_kobj = {.name = "tty"}; + +struct tty_driver *alloc_tty_driver(int lines) +{ + struct tty_driver *driver; + + driver = kmalloc(sizeof(struct tty_driver), GFP_KERNEL); + if (driver) { + memset(driver, 0, sizeof(struct tty_driver)); + driver->magic = TTY_DRIVER_MAGIC; + driver->num = lines; + /* later we'll move allocation of tables here */ + } + return driver; +} + +void put_tty_driver(struct tty_driver *driver) +{ + kfree(driver); +} + +void tty_set_operations(struct tty_driver *driver, struct tty_operations *op) +{ + driver->open = op->open; + driver->close = op->close; + driver->write = op->write; + driver->put_char = op->put_char; + driver->flush_chars = op->flush_chars; + driver->write_room = op->write_room; + driver->chars_in_buffer = op->chars_in_buffer; + driver->ioctl = op->ioctl; + driver->set_termios = op->set_termios; + driver->throttle = op->throttle; + driver->unthrottle = op->unthrottle; + driver->stop = op->stop; + driver->start = op->start; + driver->hangup = op->hangup; + driver->break_ctl = op->break_ctl; + driver->flush_buffer = op->flush_buffer; + driver->set_ldisc = op->set_ldisc; + driver->wait_until_sent = op->wait_until_sent; + driver->send_xchar = op->send_xchar; + driver->read_proc = op->read_proc; + driver->write_proc = op->write_proc; + driver->tiocmget = op->tiocmget; + driver->tiocmset = op->tiocmset; +} + + +EXPORT_SYMBOL(alloc_tty_driver); +EXPORT_SYMBOL(put_tty_driver); +EXPORT_SYMBOL(tty_set_operations); + +/* + * Called by a tty driver to register itself. 
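/*
 * For illustration: how a low-level driver consumes the API defined
 * above (alloc_tty_driver, tty_set_operations, tty_register_driver).
 * The "ex" names and numbers are invented for the sketch; a real
 * driver fills in more fields and does proper cleanup on unload.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/tty.h>

static int ex_open(struct tty_struct *tty, struct file *filp)
{
	return 0;
}

static void ex_close(struct tty_struct *tty, struct file *filp)
{
}

static int ex_write(struct tty_struct *tty, int from_user,
		    const unsigned char *buf, int count)
{
	return count;			/* pretend everything was sent */
}

static int ex_write_room(struct tty_struct *tty)
{
	return 4096;
}

static struct tty_operations ex_ops = {
	.open		= ex_open,
	.close		= ex_close,
	.write		= ex_write,
	.write_room	= ex_write_room,
};

static struct tty_driver *ex_driver;

static int __init ex_init(void)
{
	ex_driver = alloc_tty_driver(4);	/* four lines */
	if (!ex_driver)
		return -ENOMEM;

	ex_driver->owner = THIS_MODULE;
	ex_driver->driver_name = "ex_serial";
	ex_driver->name = "ttyEX";
	ex_driver->major = 0;			/* 0: let the core allocate */
	ex_driver->type = TTY_DRIVER_TYPE_SERIAL;
	ex_driver->subtype = SERIAL_TYPE_NORMAL;
	ex_driver->init_termios = tty_std_termios;
	tty_set_operations(ex_driver, &ex_ops);

	if (tty_register_driver(ex_driver)) {
		put_tty_driver(ex_driver);
		return -ENODEV;
	}
	return 0;
}
module_init(ex_init);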
+ */ +int tty_register_driver(struct tty_driver *driver) +{ + int error; + int i; + dev_t dev; + char *s; + void **p; + + if (driver->flags & TTY_DRIVER_INSTALLED) + return 0; + + p = kmalloc(driver->num * 3 * sizeof(void *), GFP_KERNEL); + if (!p) + return -ENOMEM; + memset(p, 0, driver->num * 3 * sizeof(void *)); + + if (!driver->major) { + error = alloc_chrdev_region(&dev, driver->minor_start, driver->num, + (char*)driver->name); + if (!error) { + driver->major = MAJOR(dev); + driver->minor_start = MINOR(dev); + } + } else { + dev = MKDEV(driver->major, driver->minor_start); + error = register_chrdev_region(dev, driver->num, + (char*)driver->name); + } + if (error < 0) { + kfree(p); + return error; + } + + driver->ttys = (struct tty_struct **)p; + driver->termios = (struct termios **)(p + driver->num); + driver->termios_locked = (struct termios **)(p + driver->num * 2); + + driver->cdev.kobj.parent = &tty_kobj; + strcpy(driver->cdev.kobj.name, driver->name); + for (s = strchr(driver->cdev.kobj.name, '/'); s; s = strchr(s, '/')) + *s = '!'; + cdev_init(&driver->cdev, &tty_fops); + driver->cdev.owner = driver->owner; + error = cdev_add(&driver->cdev, dev, driver->num); + if (error) { + kobject_del(&driver->cdev.kobj); + unregister_chrdev_region(dev, driver->num); + driver->ttys = NULL; + driver->termios = driver->termios_locked = NULL; + kfree(p); + return error; + } + + if (!driver->put_char) + driver->put_char = tty_default_put_char; + + list_add(&driver->tty_drivers, &tty_drivers); + + if ( !(driver->flags & TTY_DRIVER_NO_DEVFS) ) { + for(i = 0; i < driver->num; i++) + tty_register_device(driver, i, NULL); + } + proc_tty_register_driver(driver); + return 0; +} + +EXPORT_SYMBOL(tty_register_driver); + +/* + * Called by a tty driver to unregister itself. + */ +int tty_unregister_driver(struct tty_driver *driver) +{ + int i; + struct termios *tp; + void *p; + + if (driver->refcount) + return -EBUSY; + + cdev_unmap(MKDEV(driver->major, driver->minor_start), driver->num); + unregister_chrdev_region(MKDEV(driver->major, driver->minor_start), + driver->num); + + list_del(&driver->tty_drivers); + + /* + * Free the termios and termios_locked structures because + * we don't want to get memory leaks when modular tty + * drivers are removed from the kernel. + */ + for (i = 0; i < driver->num; i++) { + tp = driver->termios[i]; + if (tp) { + driver->termios[i] = NULL; + kfree(tp); + } + tp = driver->termios_locked[i]; + if (tp) { + driver->termios_locked[i] = NULL; + kfree(tp); + } + if (!(driver->flags & TTY_DRIVER_NO_DEVFS)) + tty_unregister_device(driver, i); + } + p = driver->ttys; + proc_tty_unregister_driver(driver); + driver->ttys = NULL; + driver->termios = driver->termios_locked = NULL; + kfree(p); + cdev_del(&driver->cdev); + return 0; +} + +EXPORT_SYMBOL(tty_unregister_driver); + + +/* + * Initialize the console device. This is called *early*, so + * we can't necessarily depend on lots of kernel help here. + * Just do some early initializations, and do the complex setup + * later. + */ +void __init console_init(void) +{ + initcall_t *call; + + /* Setup the default TTY line discipline. */ + (void) tty_register_ldisc(N_TTY, &tty_ldisc_N_TTY); + + /* + * set up the console device so that later boot sequences can + * inform about problems etc.. + */ +#ifdef CONFIG_EARLY_PRINTK + disable_early_printk(); +#endif +#ifdef CONFIG_SERIAL_68360 + /* This is not a console initcall. I know not what it's doing here. + So I haven't moved it. 
dwmw2 */ + rs_360_init(); +#endif + call = &__con_initcall_start; + while (call < &__con_initcall_end) { + (*call)(); + call++; + } +} + +#ifdef CONFIG_VT +extern int vty_init(void); +#endif + +static int __init tty_class_init(void) +{ + return class_register(&tty_class); +} + +postcore_initcall(tty_class_init); + +static struct cdev tty_cdev, console_cdev; +#ifdef CONFIG_UNIX98_PTYS +static struct cdev ptmx_cdev; +#endif +#ifdef CONFIG_VT +static struct cdev vc0_cdev; +#endif + +/* + * Ok, now we can initialize the rest of the tty devices and can count + * on memory allocations, interrupts etc.. + */ +static int __init tty_init(void) +{ + strcpy(tty_cdev.kobj.name, "dev.tty"); + cdev_init(&tty_cdev, &tty_fops); + if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) || + register_chrdev_region(MKDEV(TTYAUX_MAJOR, 0), 1, "/dev/tty") < 0) + panic("Couldn't register /dev/tty driver\n"); + devfs_mk_cdev(MKDEV(TTYAUX_MAJOR, 0), S_IFCHR|S_IRUGO|S_IWUGO, "tty"); + tty_add_class_device ("tty", MKDEV(TTYAUX_MAJOR, 0), NULL); + + strcpy(console_cdev.kobj.name, "dev.console"); + cdev_init(&console_cdev, &console_fops); + if (cdev_add(&console_cdev, MKDEV(TTYAUX_MAJOR, 1), 1) || + register_chrdev_region(MKDEV(TTYAUX_MAJOR, 1), 1, "/dev/console") < 0) + panic("Couldn't register /dev/console driver\n"); + devfs_mk_cdev(MKDEV(TTYAUX_MAJOR, 1), S_IFCHR|S_IRUSR|S_IWUSR, "console"); + tty_add_class_device ("console", MKDEV(TTYAUX_MAJOR, 1), NULL); + + tty_kobj.kset = tty_cdev.kobj.kset; + kobject_register(&tty_kobj); + +#ifdef CONFIG_UNIX98_PTYS + strcpy(ptmx_cdev.kobj.name, "dev.ptmx"); + cdev_init(&ptmx_cdev, &tty_fops); + if (cdev_add(&ptmx_cdev, MKDEV(TTYAUX_MAJOR, 2), 1) || + register_chrdev_region(MKDEV(TTYAUX_MAJOR, 2), 1, "/dev/ptmx") < 0) + panic("Couldn't register /dev/ptmx driver\n"); + devfs_mk_cdev(MKDEV(TTYAUX_MAJOR, 2), S_IFCHR|S_IRUGO|S_IWUGO, "ptmx"); + tty_add_class_device ("ptmx", MKDEV(TTYAUX_MAJOR, 2), NULL); +#endif + +#ifdef CONFIG_VT + strcpy(vc0_cdev.kobj.name, "dev.vc0"); + cdev_init(&vc0_cdev, &console_fops); + if (cdev_add(&vc0_cdev, MKDEV(TTY_MAJOR, 0), 1) || + register_chrdev_region(MKDEV(TTY_MAJOR, 0), 1, "/dev/vc/0") < 0) + panic("Couldn't register /dev/tty0 driver\n"); + devfs_mk_cdev(MKDEV(TTY_MAJOR, 0), S_IFCHR|S_IRUSR|S_IWUSR, "vc/0"); + tty_add_class_device ("tty0", MKDEV(TTY_MAJOR, 0), NULL); + + vty_init(); +#endif + return 0; +} +module_init(tty_init); diff -Nru a/drivers/input/keyboard/atkbd.c b/drivers/input/keyboard/atkbd.c --- a/drivers/input/keyboard/atkbd.c Fri Oct 31 14:10:54 2003 +++ b/drivers/input/keyboard/atkbd.c Fri Oct 31 14:10:54 2003 @@ -184,11 +184,19 @@ atkbd->resend = 0; #endif + switch (code) { + case ATKBD_RET_ACK: + atkbd->ack = 1; + goto out; + case ATKBD_RET_NAK: + atkbd->ack = -1; + goto out; + } + if (atkbd->translated) do { if (atkbd->emul != 1) { - if (code == ATKBD_RET_EMUL0 || code == ATKBD_RET_EMUL1 || - code == ATKBD_RET_ACK || code == ATKBD_RET_NAK) + if (code == ATKBD_RET_EMUL0 || code == ATKBD_RET_EMUL1) break; if (code == ATKBD_RET_BAT) { if (!atkbd->bat_xl) @@ -211,15 +219,6 @@ atkbd->release = 1; } while (0); - - switch (code) { - case ATKBD_RET_ACK: - atkbd->ack = 1; - goto out; - case ATKBD_RET_NAK: - atkbd->ack = -1; - goto out; - } if (atkbd->cmdcnt) { atkbd->cmdbuf[--atkbd->cmdcnt] = code; diff -Nru a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c --- a/drivers/input/mouse/psmouse-base.c Fri Oct 31 14:10:53 2003 +++ b/drivers/input/mouse/psmouse-base.c Fri Oct 31 14:10:53 2003 @@ -40,7 +40,7 @@ static 
int psmouse_noext; int psmouse_resolution; -unsigned int psmouse_rate = 60; +unsigned int psmouse_rate; int psmouse_smartscroll = PSMOUSE_LOGITECH_SMARTSCROLL; unsigned int psmouse_resetafter; @@ -471,13 +471,16 @@ * We set the mouse report rate. */ - psmouse_set_rate(psmouse); + if (psmouse_rate) + psmouse_set_rate(psmouse); /* * We also set the resolution and scaling. */ - psmouse_set_resolution(psmouse); + if (psmouse_resolution) + psmouse_set_resolution(psmouse); + psmouse_command(psmouse, NULL, PSMOUSE_CMD_SETSCALE11); /* @@ -651,10 +654,17 @@ return 1; } +static int __init psmouse_rate_setup(char *str) +{ + get_option(&str, &psmouse_rate); + return 1; +} + __setup("psmouse_noext", psmouse_noext_setup); __setup("psmouse_resolution=", psmouse_resolution_setup); __setup("psmouse_smartscroll=", psmouse_smartscroll_setup); __setup("psmouse_resetafter=", psmouse_resetafter_setup); +__setup("psmouse_rate=", psmouse_rate_setup); #endif diff -Nru a/drivers/md/dm-table.c b/drivers/md/dm-table.c --- a/drivers/md/dm-table.c Fri Oct 31 14:10:54 2003 +++ b/drivers/md/dm-table.c Fri Oct 31 14:10:54 2003 @@ -489,6 +489,18 @@ rs->max_sectors = min_not_zero(rs->max_sectors, q->max_sectors); + /* FIXME: Device-Mapper on top of RAID-0 breaks because DM + * currently doesn't honor MD's merge_bvec_fn routine. + * In this case, we'll force DM to use PAGE_SIZE or + * smaller I/O, just to be safe. A better fix is in the + * works, but add this for the time being so it will at + * least operate correctly. + */ + if (q->merge_bvec_fn) + rs->max_sectors = + min_not_zero(rs->max_sectors, + (unsigned short)(PAGE_SIZE >> 9)); + rs->max_phys_segments = min_not_zero(rs->max_phys_segments, q->max_phys_segments); diff -Nru a/drivers/media/video/bt832.c b/drivers/media/video/bt832.c --- a/drivers/media/video/bt832.c Fri Oct 31 14:10:54 2003 +++ b/drivers/media/video/bt832.c Fri Oct 31 14:10:54 2003 @@ -187,7 +187,6 @@ t->client.data = t; i2c_attach_client(&t->client); - MOD_INC_USE_COUNT; if(! 
bt832_init(&t->client)) { bt832_detach(&t->client); return -1; @@ -210,7 +209,6 @@ printk("bt832: detach.\n"); i2c_detach_client(client); kfree(t); - MOD_DEC_USE_COUNT; return 0; } diff -Nru a/drivers/media/video/bttv-cards.c b/drivers/media/video/bttv-cards.c --- a/drivers/media/video/bttv-cards.c Fri Oct 31 14:10:53 2003 +++ b/drivers/media/video/bttv-cards.c Fri Oct 31 14:10:53 2003 @@ -2439,8 +2439,8 @@ { TUNER_TEMIC_4039FR5_NTSC, "Temic 4039FR5" }, { TUNER_PHILIPS_FQ1216ME, "Philips FQ1216 ME" }, { TUNER_TEMIC_4066FY5_PAL_I, "Temic 4066FY5" }, - { TUNER_ABSENT, "Philips TD1536" }, - { TUNER_ABSENT, "Philips TD1536D" }, + { TUNER_PHILIPS_NTSC, "Philips TD1536" }, + { TUNER_PHILIPS_NTSC, "Philips TD1536D" }, { TUNER_PHILIPS_NTSC, "Philips FMR1236" }, /* mono radio */ { TUNER_ABSENT, "Philips FI1256MP" }, { TUNER_ABSENT, "Samsung TCPQ9091P" }, diff -Nru a/drivers/media/video/bttv-if.c b/drivers/media/video/bttv-if.c --- a/drivers/media/video/bttv-if.c Fri Oct 31 14:10:53 2003 +++ b/drivers/media/video/bttv-if.c Fri Oct 31 14:10:53 2003 @@ -247,7 +247,7 @@ bttv_i2c_wait_done(struct bttv *btv) { u32 stat; - int timeout; + unsigned long timeout; timeout = jiffies + HZ/100 + 1; /* 10ms */ for (;;) { diff -Nru a/drivers/media/video/meye.h b/drivers/media/video/meye.h --- a/drivers/media/video/meye.h Fri Oct 31 14:10:54 2003 +++ b/drivers/media/video/meye.h Fri Oct 31 14:10:54 2003 @@ -31,7 +31,7 @@ #define _MEYE_PRIV_H_ #define MEYE_DRIVER_MAJORVERSION 1 -#define MEYE_DRIVER_MINORVERSION 7 +#define MEYE_DRIVER_MINORVERSION 8 #include <linux/config.h> #include <linux/types.h> diff -Nru a/drivers/media/video/saa5249.c b/drivers/media/video/saa5249.c --- a/drivers/media/video/saa5249.c Fri Oct 31 14:10:54 2003 +++ b/drivers/media/video/saa5249.c Fri Oct 31 14:10:54 2003 @@ -214,7 +214,6 @@ } t->client = client; i2c_attach_client(client); - MOD_INC_USE_COUNT; return 0; } @@ -237,7 +236,6 @@ kfree(vd->priv); kfree(vd); kfree(client); - MOD_DEC_USE_COUNT; return 0; } diff -Nru a/drivers/media/video/tuner-3036.c b/drivers/media/video/tuner-3036.c --- a/drivers/media/video/tuner-3036.c Fri Oct 31 14:10:53 2003 +++ b/drivers/media/video/tuner-3036.c Fri Oct 31 14:10:53 2003 @@ -134,7 +134,6 @@ printk("tuner: SAB3036 found, status %02x\n", tuner_getstatus(client)); i2c_attach_client(client); - MOD_INC_USE_COUNT; if (i2c_master_send(client, buffer, 2) != 2) printk("tuner: i2c i/o error 1\n"); @@ -148,7 +147,6 @@ static int tuner_detach(struct i2c_client *c) { - MOD_DEC_USE_COUNT; return 0; } diff -Nru a/drivers/net/3c527.c b/drivers/net/3c527.c --- a/drivers/net/3c527.c Fri Oct 31 14:10:54 2003 +++ b/drivers/net/3c527.c Fri Oct 31 14:10:54 2003 @@ -109,6 +109,8 @@ #include "3c527.h" +MODULE_LICENSE("GPL"); + /* * The name of the card. 
Is used for messages and in the requests for * io regions, irqs and dma channels diff -Nru a/drivers/net/8139too.c b/drivers/net/8139too.c --- a/drivers/net/8139too.c Fri Oct 31 14:10:53 2003 +++ b/drivers/net/8139too.c Fri Oct 31 14:10:53 2003 @@ -245,6 +245,7 @@ {0x1186, 0x1340, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 }, {0x13d1, 0xab06, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 }, {0x1259, 0xa117, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 }, + {0x1259, 0xa11e, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 }, {0x14ea, 0xab06, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 }, {0x14ea, 0xab07, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 }, {0x11db, 0x1234, PCI_ANY_ID, PCI_ANY_ID, 0, 0, RTL8139 }, diff -Nru a/drivers/net/arm/ether1.c b/drivers/net/arm/ether1.c --- a/drivers/net/arm/ether1.c Fri Oct 31 14:10:54 2003 +++ b/drivers/net/arm/ether1.c Fri Oct 31 14:10:54 2003 @@ -1036,13 +1036,8 @@ goto release; } - printk(KERN_INFO "%s: ether1 in slot %d, ", - dev->name, ec->slot_no); - - for (i = 0; i < 6; i++) { + for (i = 0; i < 6; i++) dev->dev_addr[i] = inb(IDPROM_ADDRESS + i); - printk ("%2.2x%c", dev->dev_addr[i], i == 5 ? '\n' : ':'); - } if (ether1_init_2(dev)) { ret = -ENODEV; @@ -1060,6 +1055,12 @@ ret = register_netdev(dev); if (ret) goto release; + + printk(KERN_INFO "%s: ether1 in slot %d, ", + dev->name, ec->slot_no); + + for (i = 0; i < 6; i++) + printk ("%2.2x%c", dev->dev_addr[i], i == 5 ? '\n' : ':'); ecard_set_drvdata(ec, dev); return 0; diff -Nru a/drivers/net/arm/ether3.c b/drivers/net/arm/ether3.c --- a/drivers/net/arm/ether3.c Fri Oct 31 14:10:54 2003 +++ b/drivers/net/arm/ether3.c Fri Oct 31 14:10:54 2003 @@ -881,10 +881,6 @@ break; } - printk("%s: %s in slot %d, ", dev->name, name, ec->slot_no); - for (i = 0; i < 6; i++) - printk("%2.2x%c", dev->dev_addr[i], i == 5 ? '\n' : ':'); - if (ether3_init_2(dev)) { ret = -ENODEV; goto failed; @@ -901,6 +897,10 @@ ret = register_netdev(dev); if (ret) goto failed; + + printk("%s: %s in slot %d, ", dev->name, name, ec->slot_no); + for (i = 0; i < 6; i++) + printk("%2.2x%c", dev->dev_addr[i], i == 5 ? '\n' : ':'); ecard_set_drvdata(ec, dev); return 0; diff -Nru a/drivers/net/arm/etherh.c b/drivers/net/arm/etherh.c --- a/drivers/net/arm/etherh.c Fri Oct 31 14:10:54 2003 +++ b/drivers/net/arm/etherh.c Fri Oct 31 14:10:54 2003 @@ -665,12 +665,6 @@ break; } - printk(KERN_INFO "%s: %s in slot %d, ", - dev->name, dev_type, ec->slot_no); - - for (i = 0; i < 6; i++) - printk("%2.2x%c", dev->dev_addr[i], i == 5 ? '\n' : ':'); - ei_local = (struct ei_device *) dev->priv; if (ec->cid.product == PROD_ANT_ETHERM) { ei_local->tx_start_page = ETHERM_TX_START_PAGE; @@ -697,6 +691,12 @@ ret = register_netdev(dev); if (ret) goto release; + + printk(KERN_INFO "%s: %s in slot %d, ", + dev->name, dev_type, ec->slot_no); + + for (i = 0; i < 6; i++) + printk("%2.2x%c", dev->dev_addr[i], i == 5 ? '\n' : ':'); ecard_set_drvdata(ec, dev); diff -Nru a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c --- a/drivers/net/bonding/bond_main.c Fri Oct 31 14:10:54 2003 +++ b/drivers/net/bonding/bond_main.c Fri Oct 31 14:10:54 2003 @@ -2238,8 +2238,9 @@ static void bond_mii_monitor(struct net_device *master) { bonding_t *bond = (struct bonding *) master->priv; - slave_t *slave, *bestslave, *oldcurrent; + slave_t *slave, *oldcurrent; int slave_died = 0; + int do_failover = 0; read_lock(&bond->lock); @@ -2249,7 +2250,6 @@ * program could monitor the link itself if needed. 
*/ - bestslave = NULL; slave = (slave_t *)bond; read_lock(&bond->ptrlock); @@ -2257,8 +2257,6 @@ read_unlock(&bond->ptrlock); while ((slave = slave->prev) != (slave_t *)bond) { - /* use updelay+1 to match an UP slave even when updelay is 0 */ - int mindelay = updelay + 1; struct net_device *dev = slave->dev; int link_state; u16 old_speed = slave->speed; @@ -2269,14 +2267,7 @@ switch (slave->link) { case BOND_LINK_UP: /* the link was up */ if (link_state == BMSR_LSTATUS) { - /* link stays up, tell that this one - is immediately available */ - if (IS_UP(dev) && (mindelay > -2)) { - /* -2 is the best case : - this slave was already up */ - mindelay = -2; - bestslave = slave; - } + /* link stays up, nothing more to do */ break; } else { /* link going down */ @@ -2316,6 +2307,7 @@ (bond_mode == BOND_MODE_8023AD)) { bond_set_slave_inactive_flags(slave); } + printk(KERN_INFO "%s: link status definitely down " "for interface %s, disabling it", @@ -2332,12 +2324,10 @@ bond_alb_handle_link_change(bond, slave, BOND_LINK_DOWN); } - write_lock(&bond->ptrlock); - if (slave == bond->current_slave) { - /* find a new interface and be verbose */ - reselect_active_interface(bond); + if (slave == oldcurrent) { + do_failover = 1; } - write_unlock(&bond->ptrlock); + slave_died = 1; } else { slave->delay--; @@ -2352,13 +2342,6 @@ master->name, (downdelay - slave->delay) * miimon, dev->name); - - if (IS_UP(dev) && (mindelay > -1)) { - /* -1 is a good case : this slave went - down only for a short time */ - mindelay = -1; - bestslave = slave; - } } break; case BOND_LINK_DOWN: /* the link was down */ @@ -2428,26 +2411,12 @@ bond_alb_handle_link_change(bond, slave, BOND_LINK_UP); } - write_lock(&bond->ptrlock); - if ( (bond->primary_slave != NULL) - && (slave == bond->primary_slave) ) - reselect_active_interface(bond); - write_unlock(&bond->ptrlock); - } - else + if ((oldcurrent == NULL) || + (slave == bond->primary_slave)) { + do_failover = 1; + } + } else { slave->delay--; - - /* we'll also look for the mostly eligible slave */ - if (bond->primary_slave == NULL) { - if (IS_UP(dev) && (slave->delay < mindelay)) { - mindelay = slave->delay; - bestslave = slave; - } - } else if ( (IS_UP(bond->primary_slave->dev)) || - ( (!IS_UP(bond->primary_slave->dev)) && - (IS_UP(dev) && (slave->delay < mindelay)) ) ) { - mindelay = slave->delay; - bestslave = slave; } } break; @@ -2466,26 +2435,17 @@ } /* end of while */ - /* - * if there's no active interface and we discovered that one - * of the slaves could be activated earlier, so we do it. - */ - read_lock(&bond->ptrlock); - oldcurrent = bond->current_slave; - read_unlock(&bond->ptrlock); + if (do_failover) { + write_lock(&bond->ptrlock); - /* no active interface at the moment or need to bring up the primary */ - if (oldcurrent == NULL) { /* no active interface at the moment */ - if (bestslave != NULL) { /* last chance to find one ? 
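The surrounding bonding rework replaces the old bestslave bookkeeping with a single do_failover flag: the slave scan only decides under the cheap read locks, and the actual pointer switch happens once, afterwards, under the ptrlock write lock. A compressed pthreads sketch of that decide-then-act locking pattern (the data and names here are illustrative, not the bonding structures):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t ptrlock = PTHREAD_RWLOCK_INITIALIZER;
    static int current_slave = 0;
    static int slave_up[3] = { 0, 0, 1 };   /* slave 0 just went down */

    static void monitor(void)
    {
        int do_failover = 0;
        int i;

        pthread_rwlock_rdlock(&ptrlock);    /* scan: decide only */
        if (!slave_up[current_slave])
            do_failover = 1;
        pthread_rwlock_unlock(&ptrlock);

        if (do_failover) {
            pthread_rwlock_wrlock(&ptrlock);    /* act once */
            for (i = 0; i < 3; i++) {
                if (slave_up[i]) {
                    current_slave = i;
                    break;
                }
            }
            pthread_rwlock_unlock(&ptrlock);
        }
        printf("active slave: %d\n", current_slave);
    }

    int main(void)
    {
        monitor();          /* build with -lpthread */
        return 0;
    }

The write lock is now taken at most once per monitor pass instead of once per slave, which is the point of the refactor.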
*/ - write_lock(&bond->ptrlock); - change_active_interface(bond, bestslave); - write_unlock(&bond->ptrlock); - } else if (slave_died) { - /* print this message only once a slave has just died */ + reselect_active_interface(bond); + if (oldcurrent && !bond->current_slave) { printk(KERN_INFO "%s: now running without any active interface !\n", master->name); } + + write_unlock(&bond->ptrlock); } read_unlock(&bond->lock); @@ -2503,9 +2463,10 @@ static void loadbalance_arp_monitor(struct net_device *master) { bonding_t *bond; - slave_t *slave; + slave_t *slave, *oldcurrent; int the_delta_in_ticks = arp_interval * HZ / 1000; int next_timer = jiffies + (arp_interval * HZ / 1000); + int do_failover = 0; bond = (struct bonding *) master->priv; if (master->priv == NULL) { @@ -2529,6 +2490,10 @@ read_lock(&bond->lock); + read_lock(&bond->ptrlock); + oldcurrent = bond->current_slave; + read_unlock(&bond->ptrlock); + /* see if any of the previous devices are up now (i.e. they have * xmt and rcv traffic). the current_slave does not come into * the picture unless it is null. also, slave->jiffies is not needed @@ -2555,21 +2520,19 @@ * current_slave being null after enslaving * is closed. */ - write_lock(&bond->ptrlock); - if (bond->current_slave == NULL) { + if (oldcurrent == NULL) { printk(KERN_INFO "%s: link status definitely up " "for interface %s, ", master->name, slave->dev->name); - reselect_active_interface(bond); + do_failover = 1; } else { printk(KERN_INFO "%s: interface %s is now up\n", master->name, slave->dev->name); } - write_unlock(&bond->ptrlock); } } else { /* slave->link == BOND_LINK_UP */ @@ -2592,11 +2555,9 @@ master->name, slave->dev->name); - write_lock(&bond->ptrlock); - if (slave == bond->current_slave) { - reselect_active_interface(bond); + if (slave == oldcurrent) { + do_failover = 1; } - write_unlock(&bond->ptrlock); } } @@ -2610,6 +2571,19 @@ if (IS_UP(slave->dev)) { arp_send_all(slave); } + } + + if (do_failover) { + write_lock(&bond->ptrlock); + + reselect_active_interface(bond); + if (oldcurrent && !bond->current_slave) { + printk(KERN_INFO + "%s: now running without any active interface !\n", + master->name); + } + + write_unlock(&bond->ptrlock); } read_unlock(&bond->lock); diff -Nru a/drivers/net/ethertap.c b/drivers/net/ethertap.c --- a/drivers/net/ethertap.c Fri Oct 31 14:10:53 2003 +++ b/drivers/net/ethertap.c Fri Oct 31 14:10:53 2003 @@ -302,11 +302,12 @@ static void ethertap_rx(struct sock *sk, int len) { - struct net_device *dev = tap_map[sk->sk_protocol]; + unsigned unit = sk->sk_protocol - NETLINK_TAPBASE; + struct net_device *dev; struct sk_buff *skb; - if (dev==NULL) { - printk(KERN_CRIT "ethertap: bad unit!\n"); + if (unit >= max_taps || (dev = tap_map[unit]) == NULL) { + printk(KERN_CRIT "ethertap: bad unit %u!\n", unit); skb_queue_purge(&sk->sk_receive_queue); return; } diff -Nru a/drivers/net/pcmcia/fmvj18x_cs.c b/drivers/net/pcmcia/fmvj18x_cs.c --- a/drivers/net/pcmcia/fmvj18x_cs.c Fri Oct 31 14:10:54 2003 +++ b/drivers/net/pcmcia/fmvj18x_cs.c Fri Oct 31 14:10:54 2003 @@ -502,6 +502,8 @@ } if (link->io.NumPorts2 != 0) { + link->irq.Attributes = + IRQ_TYPE_DYNAMIC_SHARING|IRQ_FIRST_SHARED|IRQ_HANDLE_PRESENT; ret = mfc_try_io_port(link); if (ret != CS_SUCCESS) goto cs_failed; } else if (cardtype == UNGERMANN) { diff -Nru a/drivers/net/pcmcia/ibmtr_cs.c b/drivers/net/pcmcia/ibmtr_cs.c --- a/drivers/net/pcmcia/ibmtr_cs.c Fri Oct 31 14:10:54 2003 +++ b/drivers/net/pcmcia/ibmtr_cs.c Fri Oct 31 14:10:54 2003 @@ -136,7 +136,7 @@ struct net_device *dev; dev_node_t 
node; window_handle_t sram_win_handle; - struct tok_info ti; + struct tok_info *ti; } ibmtr_dev_t; static void netdev_get_drvinfo(struct net_device *dev, @@ -168,13 +168,18 @@ DEBUG(0, "ibmtr_attach()\n"); /* Create new token-ring device */ - dev = alloc_trdev(sizeof(*info)); - if (!dev) - return NULL; - info = dev->priv; + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) return NULL; + memset(info,0,sizeof(*info)); + dev = alloc_trdev(sizeof(struct tok_info)); + if (!dev) { + kfree(info); + return NULL; + } link = &info->link; link->priv = info; + info->ti = dev->priv; link->io.Attributes1 = IO_DATA_PATH_WIDTH_8; link->io.NumPorts1 = 4; @@ -265,6 +270,7 @@ *linkp = link->next; unregister_netdev(dev); free_netdev(dev); + kfree(info); } /* ibmtr_detach */ /*====================================================================== diff -Nru a/drivers/net/r8169.c b/drivers/net/r8169.c --- a/drivers/net/r8169.c Fri Oct 31 14:10:54 2003 +++ b/drivers/net/r8169.c Fri Oct 31 14:10:54 2003 @@ -292,6 +292,7 @@ MODULE_AUTHOR("Realtek"); MODULE_DESCRIPTION("RealTek RTL-8169 Gigabit Ethernet driver"); MODULE_PARM(media, "1-" __MODULE_STRING(MAX_UNITS) "i"); +MODULE_LICENSE("GPL"); static int rtl8169_open(struct net_device *dev); static int rtl8169_start_xmit(struct sk_buff *skb, struct net_device *dev); diff -Nru a/drivers/net/sis900.c b/drivers/net/sis900.c --- a/drivers/net/sis900.c Fri Oct 31 14:10:53 2003 +++ b/drivers/net/sis900.c Fri Oct 31 14:10:53 2003 @@ -1438,7 +1438,7 @@ pci_unmap_single(sis_priv->pci_dev, sis_priv->tx_ring[i].bufptr, skb->len, PCI_DMA_TODEVICE); - dev_kfree_skb(skb); + dev_kfree_skb_irq(skb); sis_priv->tx_skbuff[i] = 0; sis_priv->tx_ring[i].cmdsts = 0; sis_priv->tx_ring[i].bufptr = 0; diff -Nru a/drivers/net/starfire.c b/drivers/net/starfire.c --- a/drivers/net/starfire.c Fri Oct 31 14:10:53 2003 +++ b/drivers/net/starfire.c Fri Oct 31 14:10:53 2003 @@ -139,6 +139,7 @@ #include <linux/config.h> #include <linux/version.h> #include <linux/module.h> +#include <asm/io.h> #include <linux/kernel.h> #include <linux/pci.h> #include <linux/netdevice.h> @@ -1174,15 +1175,9 @@ TX_DESC_SPACING | TX_DESC_TYPE, ioaddr + TxDescCtrl); -#if defined(ADDR_64BITS) - writel(np->queue_mem_dma >> 32, ioaddr + RxDescQHiAddr); - writel(np->queue_mem_dma >> 32, ioaddr + TxRingHiAddr); - writel(np->queue_mem_dma >> 32, ioaddr + CompletionHiAddr); -#else - writel(0, ioaddr + RxDescQHiAddr); - writel(0, ioaddr + TxRingHiAddr); - writel(0, ioaddr + CompletionHiAddr); -#endif + writel( (np->queue_mem_dma >> 16) >> 16, ioaddr + RxDescQHiAddr); + writel( (np->queue_mem_dma >> 16) >> 16, ioaddr + TxRingHiAddr); + writel( (np->queue_mem_dma >> 16) >> 16, ioaddr + CompletionHiAddr); writel(np->rx_ring_dma, ioaddr + RxDescQAddr); writel(np->tx_ring_dma, ioaddr + TxRingPtr); diff -Nru a/drivers/net/tokenring/ibmtr.c b/drivers/net/tokenring/ibmtr.c --- a/drivers/net/tokenring/ibmtr.c Fri Oct 31 14:10:54 2003 +++ b/drivers/net/tokenring/ibmtr.c Fri Oct 31 14:10:54 2003 @@ -152,7 +152,7 @@ /* this allows displaying full adapter information */ -char *channel_def[] __initdata = { "ISA", "MCA", "ISA P&P" }; +char *channel_def[] __devinitdata = { "ISA", "MCA", "ISA P&P" }; static char pcchannelid[] __devinitdata = { 0x05, 0x00, 0x04, 0x09, @@ -864,7 +864,8 @@ ti->sram_virt &= ~1; /* to reverse what we do in tok_close */ /* init the spinlock */ ti->lock = (spinlock_t) SPIN_LOCK_UNLOCKED; - + init_timer(&ti->tr_timer); + i = tok_init_card(dev); if (i) return i; @@ -1033,7 +1034,7 @@ /* Important for PCMCIA hot 
unplug, otherwise, we'll pull the card, */ /* unloading the module from memory, and then if a timer pops, ouch */ - del_timer(&ti->tr_timer); + del_timer_sync(&ti->tr_timer); outb(0, dev->base_addr + ADAPTRESET); ti->sram_virt |= 1; ti->open_status = CLOSED; diff -Nru a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c --- a/drivers/net/wireless/airo.c Fri Oct 31 14:10:53 2003 +++ b/drivers/net/wireless/airo.c Fri Oct 31 14:10:53 2003 @@ -982,6 +982,7 @@ static int airo_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); #ifdef WIRELESS_EXT struct iw_statistics *airo_get_wireless_stats (struct net_device *dev); +static void airo_read_wireless_stats (struct airo_info *local); #endif /* WIRELESS_EXT */ #ifdef CISCO_EXT static int readrids(struct net_device *dev, aironet_ioctl *comp); @@ -1027,7 +1028,7 @@ #define FLAG_PENDING_XMIT 9 #define FLAG_PENDING_XMIT11 10 #define FLAG_PCI 11 -#define JOB_MASK 0xff0000 +#define JOB_MASK 0x1ff0000 #define JOB_DIE 16 #define JOB_XMIT 17 #define JOB_XMIT11 18 @@ -1036,6 +1037,7 @@ #define JOB_MIC 21 #define JOB_EVENT 22 #define JOB_AUTOWEP 23 +#define JOB_WSTATS 24 int (*bap_read)(struct airo_info*, u16 *pu16Dst, int bytelen, int whichbap); unsigned short *flash; @@ -1692,8 +1694,8 @@ return PC4500_writerid( ai, RID_CONFIG, &cfgr, sizeof(cfgr), lock); } -static int readStatusRid(struct airo_info*ai, StatusRid *statr) { - int rc = PC4500_readrid(ai, RID_STATUS, statr, sizeof(*statr), 1); +static int readStatusRid(struct airo_info*ai, StatusRid *statr, int lock) { + int rc = PC4500_readrid(ai, RID_STATUS, statr, sizeof(*statr), lock); u16 *s; statr->len = le16_to_cpu(statr->len); @@ -2415,6 +2417,8 @@ airo_end_xmit11(dev); else if (test_bit(JOB_STATS, &ai->flags)) airo_read_stats(ai); + else if (test_bit(JOB_WSTATS, &ai->flags)) + airo_read_wireless_stats(ai); else if (test_bit(JOB_PROMISC, &ai->flags)) airo_set_promisc(ai); #ifdef MICSUPPORT @@ -2944,7 +2948,6 @@ ai->config.opmode = adhoc ? MODE_STA_IBSS : MODE_STA_ESS; ai->config.authType = AUTH_OPEN; ai->config.modulation = MOD_CCK; - ai->config._reserved1a[0] = 2; /* ??? */ #ifdef MICSUPPORT if ((cap_rid.len>=sizeof(cap_rid)) && (cap_rid.extSoftCap&1) && @@ -3723,7 +3726,7 @@ return -ENOMEM; } - readStatusRid(apriv, &status_rid); + readStatusRid(apriv, &status_rid, 1); readCapabilityRid(apriv, &cap_rid); i = sprintf(data->rbuffer, "Status: %s%s%s%s%s%s%s%s%s\n", @@ -4767,7 +4770,7 @@ if ((local->config.opmode & 0xFF) == MODE_STA_ESS) status_rid.channel = local->config.channelSet; else - readStatusRid(local, &status_rid); + readStatusRid(local, &status_rid, 1); #ifdef WEXT_USECHANNELS fwrq->m = ((int)status_rid.channel) + 1; @@ -4842,7 +4845,7 @@ struct airo_info *local = dev->priv; StatusRid status_rid; /* Card status info */ - readStatusRid(local, &status_rid); + readStatusRid(local, &status_rid, 1); /* Note : if dwrq->flags != 0, we should * get the relevant SSID from the SSID list... */ @@ -4906,7 +4909,7 @@ struct airo_info *local = dev->priv; StatusRid status_rid; /* Card status info */ - readStatusRid(local, &status_rid); + readStatusRid(local, &status_rid, 1); /* Tentative. This seems to work, wow, I'm lucky !!! 
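The readStatusRid() signature change above is the usual lock-flag convention: the new deferred path runs in the driver thread with ai->sem already held, so it must be able to fetch the RID without re-taking the semaphore. A stripped-down pthreads sketch of the convention (read_status() and sem are invented names):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t sem = PTHREAD_MUTEX_INITIALIZER;
    static int card_status;         /* stands in for the status RID */

    /* lock != 0: take the mutex ourselves; lock == 0: caller holds it */
    static int read_status(int *out, int lock)
    {
        if (lock)
            pthread_mutex_lock(&sem);
        *out = card_status;         /* the critical section */
        if (lock)
            pthread_mutex_unlock(&sem);
        return 0;
    }

    int main(void)
    {
        int v;

        read_status(&v, 1);             /* ordinary ioctl-style caller */

        pthread_mutex_lock(&sem);       /* e.g. the worker thread */
        read_status(&v, 0);             /* already locked: no deadlock */
        pthread_mutex_unlock(&sem);
        printf("status %d\n", v);
        return 0;
    }

(The nearby ibmtr hunks are a separate but equally standard fix: init_timer() before first use, and del_timer_sync() on close so a running handler is waited out before its data goes away.)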
*/ memcpy(awrq->sa_data, status_rid.bssid[0], ETH_ALEN); @@ -5039,7 +5042,7 @@ struct airo_info *local = dev->priv; StatusRid status_rid; /* Card status info */ - readStatusRid(local, &status_rid); + readStatusRid(local, &status_rid, 1); vwrq->value = status_rid.currentXmitRate * 500000; /* If more than one rate, set auto */ @@ -5755,7 +5758,7 @@ } if (!i) { StatusRid status_rid; /* Card status info */ - readStatusRid(local, &status_rid); + readStatusRid(local, &status_rid, 1); for (i = 0; i < min(IW_MAX_AP, 4) && (status_rid.bssid[i][0] @@ -6562,16 +6565,17 @@ * * Jean */ -struct iw_statistics *airo_get_wireless_stats(struct net_device *dev) +static void airo_read_wireless_stats(struct airo_info *local) { - struct airo_info *local = dev->priv; StatusRid status_rid; StatsRid stats_rid; u32 *vals = stats_rid.vals; /* Get stats out of the card */ - readStatusRid(local, &status_rid); - readStatsRid(local, &stats_rid, RID_STATS, 1); + clear_bit(JOB_WSTATS, &local->flags); + readStatusRid(local, &status_rid, 0); + readStatsRid(local, &stats_rid, RID_STATS, 0); + up(&local->sem); /* The status */ local->wstats.status = status_rid.mode; @@ -6598,6 +6602,19 @@ local->wstats.discard.retries = vals[10]; local->wstats.discard.misc = vals[1] + vals[32]; local->wstats.miss.beacon = vals[34]; +} + +struct iw_statistics *airo_get_wireless_stats(struct net_device *dev) +{ + struct airo_info *local = dev->priv; + + /* Get stats out of the card if available */ + if (down_trylock(&local->sem) != 0) { + set_bit(JOB_WSTATS, &local->flags); + wake_up_interruptible(&local->thr_wait); + } else + airo_read_wireless_stats(local); + return &local->wstats; } #endif /* WIRELESS_EXT */ diff -Nru a/drivers/pci/quirks.c b/drivers/pci/quirks.c --- a/drivers/pci/quirks.c Fri Oct 31 14:10:53 2003 +++ b/drivers/pci/quirks.c Fri Oct 31 14:10:53 2003 @@ -750,13 +750,35 @@ * bridges pretend to be 85C503/5513 instead. In that case see if we * spotted a compatible north bridge to make sure. * (pci_find_device doesn't work yet) + * + * We can also enable the sis96x bit in the discovery register.. */ static int __devinitdata sis_96x_compatible = 0; +#define SIS_DETECT_REGISTER 0x40 + static void __init quirk_sis_503_smbus(struct pci_dev *dev) { - if (sis_96x_compatible) - quirk_sis_96x_smbus(dev); + u8 reg; + u16 devid; + + pci_read_config_byte(dev, SIS_DETECT_REGISTER, ®); + pci_write_config_byte(dev, SIS_DETECT_REGISTER, reg | (1 << 6)); + pci_read_config_word(dev, PCI_DEVICE_ID, &devid); + if ((devid & 0xfff0) != 0x0960) { + pci_write_config_byte(dev, SIS_DETECT_REGISTER, reg); + return; + } + + /* Make people aware that we changed the config.. */ + printk(KERN_WARNING "Uncovering SIS%x that hid as a SIS503 (compatible=%d)\n", devid, sis_96x_compatible); + + /* + * Ok, it now shows up as a 96x.. The 96x quirks are after + * the 503 quirk in the quirk table, so they'll automatically + * run and enable things like the SMBus device + */ + dev->device = devid; } static void __init quirk_sis_96x_compatible(struct pci_dev *dev) diff -Nru a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c --- a/drivers/pci/setup-bus.c Fri Oct 31 14:10:54 2003 +++ b/drivers/pci/setup-bus.c Fri Oct 31 14:10:54 2003 @@ -132,13 +132,19 @@ PCI-to-PCI Bridge Architecture Specification rev. 1.1 (1998) requires that if there is no I/O ports or memory behind the bridge, corresponding range must be turned off by writing base - value greater than limit to the bridge's base/limit registers. */ + value greater than limit to the bridge's base/limit registers. 
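The airo_get_wireless_stats() rewrite a little above shows the non-sleeping half of that scheme: the stats hook may be called where blocking on ai->sem is unwelcome, so it does down_trylock() and, on contention, merely marks JOB_WSTATS and wakes the driver thread. Roughly, as a userspace sketch (invented names; the real driver wakes a kernel thread rather than running the job inline):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t sem = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long flags;             /* the JOB_* bits */
    #define JOB_WSTATS (1UL << 0)

    static void read_wireless_stats(void)
    {
        /* runs with sem held and drops it when done, as airo does */
        printf("stats refreshed\n");
        pthread_mutex_unlock(&sem);
    }

    static void get_wireless_stats(void)
    {
        if (pthread_mutex_trylock(&sem) != 0) {
            /* contended: queue the job for the helper thread
             * (the driver uses set_bit() plus wake_up_interruptible()) */
            flags |= JOB_WSTATS;
            return;
        }
        read_wireless_stats();
    }

    int main(void)
    {
        get_wireless_stats();           /* uncontended: runs inline */
        pthread_mutex_lock(&sem);
        get_wireless_stats();           /* contended: deferred */
        pthread_mutex_unlock(&sem);
        printf("deferred jobs: %#lx\n", flags);
        return 0;
    }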
+ + Note: care must be taken when updating I/O base/limit registers + of bridges which support 32-bit I/O. This update requires two + config space writes, so it's quite possible that an I/O window of + the bridge will have some undesirable address (e.g. 0) after the + first write. Ditto 64-bit prefetchable MMIO. */ static void __devinit pci_setup_bridge(struct pci_bus *bus) { struct pci_dev *bridge = bus->self; struct pci_bus_region region; - u32 l; + u32 l, io_upper16; DBGC((KERN_INFO "PCI: Bus %d, bridge: %s\n", bus->number, pci_name(bridge))); @@ -151,20 +157,22 @@ l |= (region.start >> 8) & 0x00f0; l |= region.end & 0xf000; /* Set up upper 16 bits of I/O base/limit. */ - pci_write_config_word(bridge, PCI_IO_BASE_UPPER16, - region.start >> 16); - pci_write_config_word(bridge, PCI_IO_LIMIT_UPPER16, - region.end >> 16); + io_upper16 = (region.end & 0xffff0000) | (region.start >> 16); DBGC((KERN_INFO " IO window: %04lx-%04lx\n", region.start, region.end)); } else { /* Clear upper 16 bits of I/O base/limit. */ - pci_write_config_dword(bridge, PCI_IO_BASE_UPPER16, 0); + io_upper16 = 0; l = 0x00f0; DBGC((KERN_INFO " IO window: disabled.\n")); } + /* Temporarily disable the I/O range before updating PCI_IO_BASE. */ + pci_write_config_dword(bridge, PCI_IO_BASE_UPPER16, 0x0000ffff); + /* Update lower 16 bits of I/O base/limit. */ pci_write_config_dword(bridge, PCI_IO_BASE, l); + /* Update upper 16 bits of I/O base/limit. */ + pci_write_config_dword(bridge, PCI_IO_BASE_UPPER16, io_upper16); /* Set up the top and bottom of the PCI Memory segment for this bus. */ @@ -181,8 +189,9 @@ } pci_write_config_dword(bridge, PCI_MEMORY_BASE, l); - /* Clear out the upper 32 bits of PREF base/limit. */ - pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32, 0); + /* Clear out the upper 32 bits of PREF limit. + If PCI_PREF_BASE_UPPER32 was non-zero, this temporarily + disables PREF range, which is ok. */ pci_write_config_dword(bridge, PCI_PREF_LIMIT_UPPER32, 0); /* Set up PREF base/limit. */ @@ -198,6 +207,9 @@ DBGC((KERN_INFO " PREFETCH window: disabled.\n")); } pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE, l); + + /* Clear out the upper 32 bits of PREF base. */ + pci_write_config_dword(bridge, PCI_PREF_BASE_UPPER32, 0); /* Check if we have VGA behind the bridge. Enable ISA in either case (FIXME!). 
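The setup-bus comment just added describes a multi-write update hazard: a bridge's I/O window spans PCI_IO_BASE and PCI_IO_BASE_UPPER16, so between the two config writes the bridge could briefly decode a half-written range. The patch orders the writes so the window stays invalid (base above limit) until the final write. Modeled with toy registers rather than the real config-space accessors:

    #include <stdio.h>
    #include <stdint.h>

    /* toy stand-ins for the bridge's config registers */
    static uint32_t pci_io_base;        /* low 16 bits of base/limit */
    static uint32_t pci_io_upper16;     /* high 16 bits of base/limit */

    static void setup_io_window(uint32_t start, uint32_t end)
    {
        uint32_t l = ((start >> 8) & 0x00f0) | (end & 0xf000);

        /* 1: base 0xffff > limit 0x0000 disables the window, so no
         *    half-updated range is ever decoded */
        pci_io_upper16 = 0x0000ffff;
        /* 2: low halves of base and limit */
        pci_io_base = l;
        /* 3: high halves in one dword; the window turns valid here */
        pci_io_upper16 = (end & 0xffff0000) | (start >> 16);

        printf("IO window %05x-%05x programmed\n",
               (unsigned)start, (unsigned)end);
    }

    int main(void)
    {
        setup_io_window(0x10000, 0x2ffff);
        return 0;
    }

The same reasoning explains why the prefetch-base upper dword is now cleared after PCI_PREF_MEMORY_BASE rather than before it.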
*/ diff -Nru a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c --- a/drivers/pcmcia/yenta_socket.c Fri Oct 31 14:10:54 2003 +++ b/drivers/pcmcia/yenta_socket.c Fri Oct 31 14:10:54 2003 @@ -461,6 +461,7 @@ static int yenta_sock_init(struct pcmcia_socket *sock) { struct yenta_socket *socket = container_of(sock, struct yenta_socket, socket); + u32 state; u16 bridge; bridge = config_readw(socket, CB_BRIDGE_CONTROL) & ~CB_BRIDGE_INTR; @@ -472,7 +473,10 @@ exca_writeb(socket, I365_GENCTL, 0x00); /* Redo card voltage interrogation */ - cb_writel(socket, CB_SOCKET_FORCE, CB_CVSTEST); + state = cb_readl(socket, CB_SOCKET_STATE); + if (!(state & (CB_CDETECT1 | CB_CDETECT2 | CB_5VCARD | + CB_3VCARD | CB_XVCARD | CB_YVCARD))) + cb_writel(socket, CB_SOCKET_FORCE, CB_CVSTEST); yenta_clear_maps(socket); diff -Nru a/drivers/pnp/isapnp/core.c b/drivers/pnp/isapnp/core.c --- a/drivers/pnp/isapnp/core.c Fri Oct 31 14:10:54 2003 +++ b/drivers/pnp/isapnp/core.c Fri Oct 31 14:10:54 2003 @@ -1160,7 +1160,7 @@ return 0; } -device_initcall(isapnp_init); +fs_initcall(isapnp_init); /* format is: noisapnp */ diff -Nru a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c --- a/drivers/serial/serial_core.c Fri Oct 31 14:10:53 2003 +++ b/drivers/serial/serial_core.c Fri Oct 31 14:10:53 2003 @@ -1707,6 +1707,9 @@ strcat(stat_buf, "\n"); ret += sprintf(buf + ret, stat_buf); + } else { + strcat(buf, "\n"); + ret++; } #undef STATBIT #undef INFOBIT diff -Nru a/drivers/usb/serial/Kconfig b/drivers/usb/serial/Kconfig --- a/drivers/usb/serial/Kconfig Fri Oct 31 14:10:54 2003 +++ b/drivers/usb/serial/Kconfig Fri Oct 31 14:10:54 2003 @@ -73,7 +73,7 @@ config USB_SERIAL_WHITEHEAT tristate "USB ConnectTech WhiteHEAT Serial Driver" - depends on USB_SERIAL + depends on USB_SERIAL && BROKEN_ON_SMP help Say Y here if you want to use a ConnectTech WhiteHEAT 4 port USB to serial converter device. diff -Nru a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c --- a/drivers/usb/serial/digi_acceleport.c Fri Oct 31 14:10:53 2003 +++ b/drivers/usb/serial/digi_acceleport.c Fri Oct 31 14:10:53 2003 @@ -444,7 +444,7 @@ /* Local Function Declarations */ static void digi_wakeup_write( struct usb_serial_port *port ); -static void digi_wakeup_write_lock( struct usb_serial_port *port ); +static void digi_wakeup_write_lock(void *); static int digi_write_oob_command( struct usb_serial_port *port, unsigned char *buf, int count, int interruptible ); static int digi_write_inb_command( struct usb_serial_port *port, @@ -608,9 +608,9 @@ * on writes. */ -static void digi_wakeup_write_lock( struct usb_serial_port *port ) +static void digi_wakeup_write_lock(void *arg) { - + struct usb_serial_port *port = arg; unsigned long flags; struct digi_port *priv = usb_get_serial_port_data(port); diff -Nru a/fs/Kconfig b/fs/Kconfig --- a/fs/Kconfig Fri Oct 31 14:10:54 2003 +++ b/fs/Kconfig Fri Oct 31 14:10:54 2003 @@ -193,6 +193,212 @@ default y if EXT2_FS=y || EXT3_FS=y default m if EXT2_FS=m || EXT3_FS=m +config REISER4_FS + bool "Reiser4 (EXPERIMENTAL very fast general purpose filesystem)" + depends on EXPERIMENTAL + ---help--- + Reiser4 is more than twice as fast for both reads and writes as + ReiserFS. That means it is four times as fast as NTFS by Microsoft. + (Proper benchmarks will appear in a few months at + www.namesys.com/benchmarks.html, please be patient for now). 
+ + It is the storage layer of what will become a general purpose naming + system --- like what Microsoft wants OFS to be except designed with a + clean new semantic layer rather than being SQL based like OFS. + + It performs all filesystem operations as atomic transactions, which + means that it either performs a write, or it does not, and in the + event of a crash it does not partially perform it or corrupt it. + + It stores files in dancing trees, which are like balanced trees but + faster. It packs small files together so that they share blocks + without wasting space. This means you can use it to store really + small files. It also means that it saves you disk space. It avoids + hassling you with anachronisms like having a maximum number of + inodes, and wasting space if you use less than that number. + + It can handle really large directories, because its search + algorithms are logarithmic with size not linear. With Reiser4 you + should use subdirectories because they help YOU, not because they + help your filesystem's performance, or because your filesystem won't + be able to shrink a directory once you have let it grow. For squid + and similar applications, everything in one directory should perform + better. + + It has a plugin-based infrastructure, which means that you can easily + invent new kinds of files, and so can other people, so it will evolve + rapidly. + + We will be adding a variety of security features to it that DARPA has + funded us to write. + + "reiser4" is a distinct filesystem mount type from "reiserfs" (V3), + which means that "reiserfs" filesystems will be unaffected by any + reiser4 bugs. They have no code in common. Reiser4 is a complete + rewrite from scratch fully incorporating what we learned by experience + while doing "reiserfs" the first time. That was a lot. ;-) + + Reiser4 is about as stable as the usual tornado for now --- it is + for use by developers and testers only. We don't use it for our web + server --- you should not either. This will change before 2.6.0. + ReiserFS V3 is the right choice for those who want a filesystem so + stable that we can go for months now without any bug reports while we + have millions of users. + + If you'd like to upgrade from reiserfs to reiser4, use tar to a + temporary disk, maybe using NFS/ssh/SFS to get to that disk, or ask + your favorite distro to sponsor writing a conversion program. + + Sponsored by the Defense Advanced Research Projects Agency (DARPA) + of the United States Government. DARPA does not endorse this + project, it merely sponsors it. + See http://www.darpa.mil/ato/programs/chats.htm + + To learn more about reiser4, go to http://www.namesys.com + +config REISER4_FS_SYSCALL + bool +# bool "Enable reiser4 system call" + default n + depends on REISER4_FS + ---help--- + Adds sys_reiser4() syscall. + This code is not in good shape yet and may not compile and stuff like that. + +config REISER4_LARGE_KEY + bool "Use larger keys on reiser4 tree" + depends on REISER4_FS + default y + ---help--- + Make keys larger and use additional bits to order bodies of files within + a directory in the order of their names, which is what you want + normally. If you turn this off, file bodies will be ordered by creation + time, which is not optimal for most users. + + Warning: flipping this option makes your file system binary + incompatible. + +config REISER4_CHECK + bool "Enable reiser4 debug options" + depends on REISER4_FS + ---help--- + Don't use this unless you are a developer debugging reiser4.
If + using a kernel made by a distro that thinks they are our competitor + (sigh) rather than made by Linus, always check each release to make + sure they have not turned this on to make us look slow as was done + once in the past. This checks everything imaginable while reiser4 + runs. + + When adding features to reiser4 you should set this, and then + extensively test the code, and then send to us and we will test it + again. Include a description of what you did to test it. All + reiser4 code must be tested, reviewed, and signed off on by two + persons before it will be accepted into a stable kernel by Hans. + +config REISER4_DEBUG + bool "Assertions" + depends on REISER4_CHECK + help + Turns on assertion checks. Eats a lot of CPU. + +config REISER4_FS_SYSCALL_DEBUG + bool "Enable reiser4 system call debug" + depends on REISER4_CHECK + help + Turns on debugging of reiser4_system_call. + +config REISER4_DEBUG_MODIFY + bool "Dirtying" + depends on REISER4_CHECK + help + Check that a node is marked dirty each time it is modified. This is done + by maintaining a checksum of the node contents. CPU hog. + +config REISER4_DEBUG_MEMCPY + bool "Memory copying" + depends on REISER4_CHECK + help + Use special non-inlined versions of memcpy, memset, and memmove in + reiser4 to estimate the amount of CPU time spent in data copying. + +config REISER4_DEBUG_NODE + bool "Node consistency" + depends on REISER4_CHECK + help + Run consistency checks on nodes in the balanced tree. CPU hog. + +config REISER4_ZERO_NEW_NODE + bool "Node zeroing" + depends on REISER4_CHECK + help + Zero new nodes before use. + +config REISER4_TRACE + bool "Tracing" + depends on REISER4_CHECK + help + Turn on the tracing facility. This enables the trace_flags mount option. + +config REISER4_EVENT_LOG + bool "Log events" + depends on REISER4_CHECK + help + Log events into a user supplied file. This enables the trace_file mount option. + +config REISER4_STATS + bool "Statistics" + depends on REISER4_CHECK + help + Turn on statistics collection. This increases the size of the in-memory super + block considerably. + +config REISER4_PROF + bool "Profiling" + depends on REISER4_CHECK + help + Turn on collection of profiling information, available through sysfs. + +config REISER4_LOCKPROF + bool "Lock Profiling" + depends on REISER4_CHECK && PROFILING + help + Turn on collection of spin lock contention information. + +config REISER4_DEBUG_OUTPUT + bool "Printing" + depends on REISER4_CHECK + help + Enable compilation of functions that print internal kernel data + structures in human readable form. Useful for debugging. + +config REISER4_NOOPT + bool "Disable optimization" + depends on REISER4_CHECK + help + Disable compiler optimizations for reiser4 code. + +config REISER4_USE_EFLUSH +# bool "Enable emergency flush" + bool + default y + depends on REISER4_FS + help + Say Y unless you know what you are doing. Details are in reiser4/emergency_flush.c. + +config REISER4_COPY_ON_CAPTURE + bool "Enable copy on capture" + depends on REISER4_FS + help + Say N unless you know what you are doing. This is under development. + +config REISER4_BADBLOCKS + bool "Enable handling of badblocks in system areas" + depends on REISER4_FS + help + This allows you to use filesystems with badblocks in static reiser4 system areas + (such as the superblock, bitmaps, and journal header/footer). This imposes some performance + penalty, so say N unless you have such a filesystem.
+ config REISERFS_FS tristate "Reiserfs support" help
This option + enables an extended attribute handler for file security + labels in the ext2 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config EXT3_FS + tristate "Ext3 journalling file system support" + help + This is the journaling version of the Second extended file system + (often called ext3), the de facto standard Linux file system + (method to organize files on a storage device) for hard disks. + + The journaling code included in this driver means you do not have + to run e2fsck (file system checker) on your file systems after a + crash. The journal keeps track of any changes that were being made + at the time the system crashed, and can ensure that your file system + is consistent without the need for a lengthy check. + + Other than adding the journal to the file system, the on-disk format + of ext3 is identical to ext2. It is possible to freely switch + between using the ext3 driver and the ext2 driver, as long as the + file system has been cleanly unmounted, or e2fsck is run on the file + system. + + To add a journal on an existing ext2 file system or change the + behavior of ext3 file systems, you can use the tune2fs utility ("man + tune2fs"). To modify attributes of files and directories on ext3 + file systems, use chattr ("man chattr"). You need to be using + e2fsprogs version 1.20 or later in order to create ext3 journals + (available at <http://sourceforge.net/projects/e2fsprogs/>). + + To compile this file system support as a module, choose M here: the + module will be called ext3. Be aware however that the file system + of your root partition (the one containing the directory /) cannot + be compiled as a module, and so this may be dangerous. + +config EXT3_FS_XATTR + bool "Ext3 extended attributes" + depends on EXT3_FS + default y + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. + + You need this for POSIX ACL support on ext3. + +config EXT3_FS_POSIX_ACL + bool "Ext3 POSIX Access Control Lists" + depends on EXT3_FS_XATTR + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config EXT3_FS_SECURITY + bool "Ext3 Security Labels" + depends on EXT3_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ext3 filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JBD +# CONFIG_JBD could be its own option (even modular), but until there are +# other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS +# dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS + tristate + default EXT3_FS + help + This is a generic journaling layer for block devices. It is + currently used by the ext3 file system, but it could also be used to + add journal support to other file systems or block devices such as + RAID or LVM. + + If you are using the ext3 file system, you need to say Y here. 
If + you are not using ext3 then you will probably want to say N. + + To compile this device as a module, choose M here: the module will be + called jbd. If you are compiling ext3 into the kernel, you cannot + compile this code as a module. + +config JBD_DEBUG + bool "JBD (ext3) debugging support" + depends on JBD + help + If you are using the ext3 journaled file system (or potentially any + other file system/device using JBD), this option allows you to + enable debugging output while the system is running, in order to + help track down any problems you are having. By default the + debugging output will be turned off. + + If you select Y here, then you will be able to turn on debugging + with "echo N > /proc/sys/fs/jbd-debug", where N is a number between + 1 and 5, the higher the number, the more debugging output is + generated. To turn debugging off again, do + "echo 0 > /proc/sys/fs/jbd-debug". + +config FS_MBCACHE +# Meta block cache for Extended Attributes (ext2/ext3) + tristate + depends on EXT2_FS_XATTR || EXT3_FS_XATTR + default y if EXT2_FS=y || EXT3_FS=y + default m if EXT2_FS=m || EXT3_FS=m + +config REISERFS_FS + tristate "Reiserfs support" + help + Stores not just filenames but the files themselves in a balanced + tree. Uses journaling. + + Balanced trees are more efficient than traditional file system + architectural foundations. + + In general, ReiserFS is as fast as ext2, but is very efficient with + large directories and small files. Additional patches are needed + for NFS and quotas, please see <http://www.reiserfs.org/> for links. + + It is more easily extended to have features currently found in + database and keyword search systems than block allocation based file + systems are. The next version will be so extended, and will support + plugins consistent with our motto ``It takes more than a license to + make source code open.'' + + Read <http://www.reiserfs.org/> to learn more about reiserfs. + + Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com. + + If you like it, you can pay us to add new features to it that you + need, buy a support contract, or pay us to port it to another OS. + +config REISERFS_CHECK + bool "Enable reiserfs debug mode" + depends on REISERFS_FS + help + If you set this to Y, then ReiserFS will perform every check it can + possibly imagine of its internal consistency throughout its + operation. It will also go substantially slower. More than once we + have forgotten that this was on, and then gone despondent over the + latest benchmarks.:-) Use of this option allows our team to go all + out in checking for consistency when debugging without fear of its + effect on end users. If you are on the verge of sending in a bug + report, say Y and you might get a useful error message. Almost + everyone should say N. + +config REISERFS_PROC_INFO + bool "Stats in /proc/fs/reiserfs" + depends on REISERFS_FS + help + Create under /proc/fs/reiserfs a hierarchy of files, displaying + various ReiserFS statistics and internal data at the expense of + making your kernel or module slightly larger (+8 KB). This also + increases the amount of kernel memory required for each mount. + Almost everyone but ReiserFS developers and people fine-tuning + reiserfs or tracing problems should say N. + +config JFS_FS + tristate "JFS filesystem support" + help + This is a port of IBM's Journaled Filesystem . More information is + available in the file Documentation/filesystems/jfs.txt. + + If you do not intend to use the JFS filesystem, say N. 
+ +config JFS_POSIX_ACL + bool "JFS POSIX Access Control Lists" + depends on JFS_FS + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config JFS_DEBUG + bool "JFS debugging" + depends on JFS_FS + help + If you are experiencing any problems with the JFS filesystem, say + Y here. This will result in additional debugging messages to be + written to the system log. Under normal circumstances, this + results in very little overhead. + +config JFS_STATISTICS + bool "JFS statistics" + depends on JFS_FS + help + Enabling this option will cause statistics from the JFS file system + to be made available to the user in the /proc/fs/jfs/ directory. + +config FS_POSIX_ACL +# Posix ACL utility routines (for now, only ext2/ext3/jfs) +# +# NOTE: you can implement Posix ACLs without these helpers (XFS does). +# Never use this symbol for ifdefs. +# + bool + depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL + default y + +config XFS_FS + tristate "XFS filesystem support" + help + XFS is a high performance journaling filesystem which originated + on the SGI IRIX platform. It is completely multi-threaded, can + support large files and large filesystems, extended attributes, + variable block sizes, is extent based, and makes extensive use of + Btrees (directories, extents, free space) to aid both performance + and scalability. + + Refer to the documentation at <http://oss.sgi.com/projects/xfs/> + for complete details. This implementation is on-disk compatible + with the IRIX version of XFS. + + To compile this file system support as a module, choose M here: the + module will be called xfs. Be aware, however, that if the file + system of your root partition is compiled as a module, you'll need + to use an initial ramdisk (initrd) to boot. + +config XFS_RT + bool "Realtime support (EXPERIMENTAL)" + depends on XFS_FS && EXPERIMENTAL + help + If you say Y here you will be able to mount and use XFS filesystems + which contain a realtime subvolume. The realtime subvolume is a + separate area of disk space where only file data is stored. The + realtime subvolume is designed to provide very deterministic + data rates suitable for media streaming applications. + + See the xfs man page in section 5 for a bit more information. + + This feature is unsupported at this time, is not yet fully + functional, and may cause serious problems. + + If unsure, say N. + +config XFS_QUOTA + bool "Quota support" + depends on XFS_FS + help + If you say Y here, you will be able to set limits for disk usage on + a per user and/or a per group basis under XFS. XFS considers quota + information as filesystem metadata and uses journaling to provide a + higher level guarantee of consistency. The on-disk data format for + quota is also compatible with the IRIX version of XFS, allowing a + filesystem to be migrated between Linux and IRIX without any need + for conversion. + + If unsure, say N. More comprehensive documentation can be found in + README.quota in the xfsprogs package. XFS quota can be used either + with or without the generic quota support enabled (CONFIG_QUOTA) - + they are completely independent subsystems. 
+ +config XFS_POSIX_ACL + bool "ACL support" + depends on XFS_FS + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website <http://acl.bestbits.at/>. + + If you don't know what Access Control Lists are, say N + +config MINIX_FS + tristate "Minix fs support" + help + Minix is a simple operating system used in many classes about OS's. + The minix file system (method to organize files on a hard disk + partition or a floppy disk) was the original file system for Linux, + but has been superseded by the second extended file system ext2fs. + You don't want to use the minix file system on your hard disk + because of certain built-in restrictions, but it is sometimes found + on older Linux floppy disks. This option will enlarge your kernel + by about 28 KB. If unsure, say N. + + To compile this file system support as a module, choose M here: the + module will be called minix. Note that the file system of your root + partition (the one containing the directory /) cannot be compiled as + a module. + +config ROMFS_FS + tristate "ROM file system support" + ---help--- + This is a very small read-only file system mainly intended for + initial ram disks of installation disks, but it could be used for + other read-only media as well. Read + <file:Documentation/filesystems/romfs.txt> for details. + + To compile this file system support as a module, choose M here: the + module will be called romfs. Note that the file system of your + root partition (the one containing the directory /) cannot be a + module. + + If you don't know whether you need it, then you don't need it: + answer N. + +config QUOTA + bool "Quota support" + help + If you say Y here, you will be able to set per user limits for disk + usage (also called disk quotas). Currently, it works for the + ext2, ext3, and reiserfs file system. You need additional software + in order to use quota support (you can download sources from + <http://www.sf.net/projects/linuxquota/>). For further details, read + the Quota mini-HOWTO, available from + <http://www.tldp.org/docs.html#howto>. Probably the quota + support is only useful for multi user systems. If unsure, say N. + +config QFMT_V1 + tristate "Old quota format support" + depends on QUOTA + help + This quota format was (is) used by kernels earlier than 2.4.??. If + you have quota working and you don't want to convert to new quota + format say Y here. + +config QFMT_V2 + tristate "Quota format v2 support" + depends on QUOTA + help + This quota format allows using quotas with 32-bit UIDs/GIDs. If you + need this functionality say Y here. Note that you will need latest + quota utilities for new quota format with this kernel. + +config QUOTACTL + bool + depends on XFS_QUOTA || QUOTA + default y + +config AUTOFS_FS + tristate "Kernel automounter support" + help + The automounter is a tool to automatically mount remote file systems + on demand. This implementation is partially kernel-based to reduce + overhead in the already-mounted case; this is unlike the BSD + automounter (amd), which is a pure user space daemon. + + To use the automounter you need the user-space tools from the autofs + package; you can find the location in <file:Documentation/Changes>. + You also want to answer Y to "NFS file system support", below. 
+ + If you want to use the newer version of the automounter with more + features, say N here and say Y to "Kernel automounter v4 support", + below. + + To compile this support as a module, choose M here: the module will be + called autofs. + + If you are not a part of a fairly large, distributed network, you + probably do not need an automounter, and can say N here. + +config AUTOFS4_FS + tristate "Kernel automounter version 4 support (also supports v3)" + help + The automounter is a tool to automatically mount remote file systems + on demand. This implementation is partially kernel-based to reduce + overhead in the already-mounted case; this is unlike the BSD + automounter (amd), which is a pure user space daemon. + + To use the automounter you need the user-space tools from + <ftp://ftp.kernel.org/pub/linux/daemons/autofs/testing-v4/>; you also + want to answer Y to "NFS file system support", below. + + To compile this support as a module, choose M here: the module will be + called autofs4. You will need to add "alias autofs autofs4" to your + modules configuration file. + + If you are not a part of a fairly large, distributed network or + don't have a laptop which needs to dynamically reconfigure to the + local network, you probably do not need an automounter, and can say + N here. + +menu "CD-ROM/DVD Filesystems" + +config ISO9660_FS + tristate "ISO 9660 CDROM file system support" + help + This is the standard file system used on CD-ROMs. It was previously + known as "High Sierra File System" and is called "hsfs" on other + Unix systems. The so-called Rock-Ridge extensions which allow for + long Unix filenames and symbolic links are also supported by this + driver. If you have a CD-ROM drive and want to do more with it than + just listen to audio CDs and watch its LEDs, say Y (and read + <file:Documentation/filesystems/isofs.txt> and the CD-ROM-HOWTO, + available from <http://www.tldp.org/docs.html#howto>), thereby + enlarging your kernel by about 27 KB; otherwise say N. + + To compile this file system support as a module, choose M here: the + module will be called isofs. + +config JOLIET + bool "Microsoft Joliet CDROM extensions" + depends on ISO9660_FS + help + Joliet is a Microsoft extension for the ISO 9660 CD-ROM file system + which allows for long filenames in unicode format (unicode is the + new 16 bit character code, successor to ASCII, which encodes the + characters of almost all languages of the world; see + <http://www.unicode.org/> for more information). Say Y here if you + want to be able to read Joliet CD-ROMs under Linux. + +config ZISOFS + bool "Transparent decompression extension" + depends on ISO9660_FS + select ZLIB_INFLATE + help + This is a Linux-specific extension to RockRidge which lets you store + data in compressed form on a CD-ROM and have it transparently + decompressed when the CD-ROM is accessed. See + <http://www.kernel.org/pub/linux/utils/fs/zisofs/> for the tools + necessary to create such a filesystem. Say Y here if you want to be + able to read such compressed CD-ROMs. + +config ZISOFS_FS +# for fs/nls/Config.in + tristate + depends on ZISOFS + default ISO9660_FS + +config UDF_FS + tristate "UDF file system support" + help + This is the new file system used on some CD-ROMs and DVDs. Say Y if + you intend to mount DVD discs or CDRW's written in packet mode, or + if written to by other UDF utilities, such as DirectCD. + Please read <file:Documentation/filesystems/udf.txt>. 
+ + To compile this file system support as a module, choose M here: the + module will be called udf. + + If unsure, say N. + +endmenu + +menu "DOS/FAT/NT Filesystems" + +config FAT_FS + tristate "DOS FAT fs support" + help + If you want to use one of the FAT-based file systems (the MS-DOS, + VFAT (Windows 95) and UMSDOS (used to run Linux on top of an + ordinary DOS partition) file systems), then you must say Y or M here + to include FAT support. You will then be able to mount partitions or + diskettes with FAT-based file systems and transparently access the + files on them, i.e. MSDOS files will look and behave just like all + other Unix files. + + This FAT support is not a file system in itself, it only provides + the foundation for the other file systems. You will have to say Y or + M to at least one of "MSDOS fs support" or "VFAT fs support" in + order to make use of it. + + Another way to read and write MSDOS floppies and hard drive + partitions from within Linux (but not transparently) is with the + mtools ("man mtools") program suite. You don't need to say Y here in + order to do that. + + If you need to move large files on floppies between a DOS and a + Linux box, say Y here, mount the floppy under Linux with an MSDOS + file system and use GNU tar's M option. GNU tar is a program + available for Unix and DOS ("man tar" or "info tar"). + + It is now also becoming possible to read and write compressed FAT + file systems; read <file:Documentation/filesystems/fat_cvf.txt> for + details. + + The FAT support will enlarge your kernel by about 37 KB. If unsure, + say Y. + + To compile this as a module, choose M here: the module will be called + fat. Note that if you compile the FAT support as a module, you + cannot compile any of the FAT-based file systems into the kernel + -- they will have to be modules as well. + The file system of your root partition (the one containing the + directory /) cannot be a module, so don't say M here if you intend + to use UMSDOS as your root file system. + +config MSDOS_FS + tristate "MSDOS fs support" + depends on FAT_FS + help + This allows you to mount MSDOS partitions of your hard drive (unless + they are compressed; to access compressed MSDOS partitions under + Linux, you can either use the DOS emulator DOSEMU, described in the + DOSEMU-HOWTO, available from + <http://www.tldp.org/docs.html#howto>, or try dmsdosfs in + <ftp://ibiblio.org/pub/Linux/system/filesystems/dosfs/>. If you + intend to use dosemu with a non-compressed MSDOS partition, say Y + here) and MSDOS floppies. This means that file access becomes + transparent, i.e. the MSDOS files look and behave just like all + other Unix files. + + If you want to use UMSDOS, the Unix-like file system on top of a + DOS file system, which allows you to run Linux from within a DOS + partition without repartitioning, you'll have to say Y or M here. + + If you have Windows 95 or Windows NT installed on your MSDOS + partitions, you should use the VFAT file system (say Y to "VFAT fs + support" below), or you will not be able to see the long filenames + generated by Windows 95 / Windows NT. + + This option will enlarge your kernel by about 7 KB. If unsure, + answer Y. This will only work if you said Y to "DOS FAT fs support" + as well. To compile this as a module, choose M here: the module will + be called msdos. + +config VFAT_FS + tristate "VFAT (Windows-95) fs support" + depends on FAT_FS + help + This option provides support for normal Windows file systems with + long filenames. 
That includes non-compressed FAT-based file systems + used by Windows 95, Windows 98, Windows NT 4.0, and the Unix + programs from the mtools package. + + You cannot use the VFAT file system for your Linux root partition + (the one containing the directory /); use UMSDOS instead if you + want to run Linux from within a DOS partition (i.e. say Y to + "Unix like fs on top of std MSDOS fs", below). + + The VFAT support enlarges your kernel by about 10 KB and it only + works if you said Y to the "DOS FAT fs support" above. Please read + the file <file:Documentation/filesystems/vfat.txt> for details. If + unsure, say Y. + + To compile this as a module, choose M here: the module will be called + vfat. + +config UMSDOS_FS +#dep_tristate ' UMSDOS: Unix-like file system on top of standard MSDOS fs' CONFIG_UMSDOS_FS $CONFIG_MSDOS_FS +# UMSDOS is temprory broken + bool + help + Say Y here if you want to run Linux from within an existing DOS + partition of your hard drive. The advantage of this is that you can + get away without repartitioning your hard drive (which often implies + backing everything up and restoring afterwards) and hence you're + able to quickly try out Linux or show it to your friends; the + disadvantage is that Linux becomes susceptible to DOS viruses and + that UMSDOS is somewhat slower than ext2fs. Another use of UMSDOS + is to write files with long unix filenames to MSDOS floppies; it + also allows Unix-style soft-links and owner/permissions of files on + MSDOS floppies. You will need a program called umssync in order to + make use of UMSDOS; read + <file:Documentation/filesystems/umsdos.txt>. + + To get utilities for initializing/checking UMSDOS file system, or + latest patches and/or information, visit the UMSDOS home page at + <http://www.voyager.hr/~mnalis/umsdos/>. + + This option enlarges your kernel by about 28 KB and it only works if + you said Y to both "DOS FAT fs support" and "MSDOS fs support" + above. To compile this as a module, choose M here: the module will be + called umsdos. Note that the file system of your root partition + (the one containing the directory /) cannot be a module, so saying M + could be dangerous. If unsure, say N. + +config NTFS_FS + tristate "NTFS file system support" + help + NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. + + Saying Y or M here enables read support. There is partial, but + safe, write support available. For write support you must also + say Y to "NTFS write support" below. + + There are also a number of user-space tools available, called + ntfsprogs. These include ntfsundelete and ntfsresize, that work + without NTFS support enabled in the kernel. + + This is a rewrite from scratch of Linux NTFS support and replaced + the old NTFS code starting with Linux 2.5.11. A backport to + the Linux 2.4 kernel series is separately available as a patch + from the project web site. + + For more information see <file:Documentation/filesystems/ntfs.txt> + and <http://linux-ntfs.sourceforge.net/>. + + To compile this file system support as a module, choose M here: the + module will be called ntfs. + + If you are not using Windows NT, 2000, XP or 2003 in addition to + Linux on your computer it is safe to say N. + +config NTFS_DEBUG + bool "NTFS debugging support" + depends on NTFS_FS + help + If you are experiencing any problems with the NTFS file system, say + Y here. 
+ This will cause additional consistency checks to be
+ performed by the driver as well as additional debugging messages to
+ be written to the system log. Note that debugging messages are
+ disabled by default. To enable them, supply the option debug_msgs=1
+ at the kernel command line when booting the kernel or as an option
+ to insmod when loading the ntfs module. Once the driver is active,
+ you can enable debugging messages by doing (as root):
+ echo 1 > /proc/sys/fs/ntfs-debug
+ Replacing the "1" with "0" would disable debug messages.
+
+ If you leave debugging messages disabled, this results in little
+ overhead, but enabling debug messages results in a very significant
+ slowdown of the system.
+
+ When reporting bugs, please try to have available a full dump of
+ debugging messages while the misbehaviour was occurring.
+
+config NTFS_RW
+ bool "NTFS write support"
+ depends on NTFS_FS
+ help
+ This enables the partial, but safe, write support in the NTFS driver.
+
+ The only supported operation is overwriting existing files, without
+ changing the file length. No file or directory creation, deletion or
+ renaming is possible. Note that only non-resident files can be
+ written to, so you may find that some very small files (<500 bytes
+ or so) cannot be written to.
+
+ While we cannot guarantee that it will not damage any data, we have
+ so far not received a single report where the driver would have
+ damaged someone's data, so we assume it is perfectly safe to use.
+
+ Note: While write support is safe in this version (a rewrite from
+ scratch of the NTFS support), it should be noted that the old NTFS
+ write support, included in Linux 2.5.10 and before (since 1997),
+ is not safe.
+
+ This is currently useful with TopologiLinux. TopologiLinux is run
+ on top of any DOS/Microsoft Windows system without partitioning your
+ hard disk. Unlike other Linux distributions TopologiLinux does not
+ need its own partition. For more information see
+ <http://topologi-linux.sourceforge.net/>.
+
+ It is perfectly safe to say N here.
+
+endmenu
+
+menu "Pseudo filesystems"
+
+config PROC_FS
+ bool "/proc file system support"
+ help
+ This is a virtual file system providing information about the status
+ of the system. "Virtual" means that it doesn't take up any space on
+ your hard disk: the files are created on the fly by the kernel when
+ you try to access them. Also, you cannot read the files with older
+ versions of the program less: you need to use more or cat.
+
+ It's totally cool; for example, "cat /proc/interrupts" gives
+ information about what the different IRQs are used for at the moment
+ (there is a small number of Interrupt ReQuest lines in your computer
+ that are used by the attached devices to gain the CPU's attention --
+ often a source of trouble if two devices are mistakenly configured
+ to use the same IRQ). The program procinfo displays some
+ information about your system gathered from the /proc file system.
+
+ Before you can use the /proc file system, it has to be mounted,
+ meaning it has to be given a location in the directory hierarchy.
+ That location should be /proc. A command such as "mount -t proc proc
+ /proc" or the equivalent line in /etc/fstab does the job.
+
+ The /proc file system is explained in the file
+ <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage
+ ("man 5 proc").
+
+ This option will enlarge your kernel by about 67 KB. Several
+ programs depend on this, so everyone should say Y here.
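+
+# A minimal userspace sketch of the mount step described above
+# (illustrative only, with minimal error handling): mount(2) with file
+# system type "proc", then reading /proc/interrupts as the help text
+# suggests. Kept as a comment so this remains valid Kconfig.
+#
+#	#include <stdio.h>
+#	#include <sys/mount.h>
+#
+#	int main(void)
+#	{
+#		char line[256];
+#		FILE *f;
+#
+#		/* equivalent to "mount -t proc proc /proc";
+#		   fails harmlessly if proc is already mounted */
+#		if (mount("proc", "/proc", "proc", 0, NULL) != 0)
+#			perror("mount");
+#
+#		f = fopen("/proc/interrupts", "r"); /* generated on the fly */
+#		if (!f)
+#			return 1;
+#		while (fgets(line, sizeof(line), f))
+#			fputs(line, stdout);
+#		fclose(f);
+#		return 0;
+#	}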
+ +config PROC_KCORE + bool + default y if !ARM + +config DEVFS_FS + bool "/dev file system support (OBSOLETE)" + depends on EXPERIMENTAL + help + This is support for devfs, a virtual file system (like /proc) which + provides the file system interface to device drivers, normally found + in /dev. Devfs does not depend on major and minor number + allocations. Device drivers register entries in /dev which then + appear automatically, which means that the system administrator does + not have to create character and block special device files in the + /dev directory using the mknod command (or MAKEDEV script) anymore. + + This is work in progress. If you want to use this, you *must* read + the material in <file:Documentation/filesystems/devfs/>, especially + the file README there. + + Note that devfs no longer manages /dev/pts! If you are using UNIX98 + ptys, you will also need to enable (and mount) the /dev/pts + filesystem (CONFIG_DEVPTS_FS). + + Note that devfs has been obsoleted by udev, + <http://www.kernel.org/pub/linux/utils/kernel/hotplug/>. + It has been stripped down to a bare minimum and is only provided for + legacy installations that use its naming scheme which is + unfortunately different from the names normal Linux installations + use. + + If unsure, say N. + +config DEVFS_MOUNT + bool "Automatically mount at boot" + depends on DEVFS_FS + help + This option appears if you have CONFIG_DEVFS_FS enabled. Setting + this to 'Y' will make the kernel automatically mount devfs onto /dev + when the system is booted, before the init thread is started. + You can override this with the "devfs=nomount" boot option. + + If unsure, say N. + +config DEVFS_DEBUG + bool "Debug devfs" + depends on DEVFS_FS + help + If you say Y here, then the /dev file system code will generate + debugging messages. See the file + <file:Documentation/filesystems/devfs/boot-options> for more + details. + + If unsure, say N. + +config DEVPTS_FS +# It compiles as a module for testing only. It should not be used +# as a module in general. If we make this "tristate", a bunch of people +# who don't know what they are doing turn it on and complain when it +# breaks. + bool "/dev/pts file system for Unix98 PTYs" + depends on UNIX98_PTYS + ---help--- + You should say Y here if you said Y to "Unix98 PTY support" above. + You'll then get a virtual file system which can be mounted on + /dev/pts with "mount -t devpts". This, together with the pseudo + terminal master multiplexer /dev/ptmx, is used for pseudo terminal + support as described in The Open Group's Unix98 standard: in order + to acquire a pseudo terminal, a process opens /dev/ptmx; the number + of the pseudo terminal is then made available to the process and the + pseudo terminal slave can be accessed as /dev/pts/<number>. What was + traditionally /dev/ttyp2 will then be /dev/pts/2, for example. + + The GNU C library glibc 2.1 contains the requisite support for this + mode of operation; you also need client programs that use the Unix98 + API. Please read <file:Documentation/Changes> for more information + about the Unix98 pty devices. + +config DEVPTS_FS_XATTR + bool "/dev/pts Extended Attributes" + depends on DEVPTS_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + If unsure, say N. 
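+
+# A minimal sketch of the Unix98 pty flow described under "/dev/pts file
+# system for Unix98 PTYs" above (illustrative only): open the master
+# multiplexer /dev/ptmx, then derive the /dev/pts/<number> slave name.
+# Kept as a comment so this remains valid Kconfig.
+#
+#	#define _XOPEN_SOURCE 600
+#	#include <stdio.h>
+#	#include <stdlib.h>
+#	#include <fcntl.h>
+#
+#	int main(void)
+#	{
+#		int master = open("/dev/ptmx", O_RDWR);
+#
+#		if (master < 0 || grantpt(master) != 0 || unlockpt(master) != 0)
+#			return 1;
+#		/* the slave side appears as /dev/pts/<number> */
+#		printf("slave pty: %s\n", ptsname(master));
+#		return 0;
+#	}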
+
+config DEVPTS_FS_SECURITY
+ bool "/dev/pts Security Labels"
+ depends on DEVPTS_FS_XATTR
+ help
+ Security labels support alternative access control models
+ implemented by security modules like SELinux. This option
+ enables an extended attribute handler for file security
+ labels in the /dev/pts filesystem.
+
+ If you are not using a security module that requires using
+ extended attributes for file security labels, say N.
+
+config TMPFS
+ bool "Virtual memory file system support (former shm fs)"
+ help
+ Tmpfs is a file system which keeps all files in virtual memory.
+
+ Everything in tmpfs is temporary in the sense that no files will be
+ created on your hard drive. The files live in memory and swap
+ space. If you unmount a tmpfs instance, everything stored therein is
+ lost.
+
+ See <file:Documentation/filesystems/tmpfs.txt> for details.
+
+config HUGETLBFS
+ bool "HugeTLB file system support"
+ depends on X86 || IA64 || PPC64 || SPARC64 || X86_64 || BROKEN
+
+config HUGETLB_PAGE
+ def_bool HUGETLBFS
+
+config RAMFS
+ bool
+ default y
+ ---help---
+ Ramfs is a file system which keeps all files in RAM. It allows
+ read and write access.
+
+ It is more of a programming example than a usable file system. If
+ you need a file system which lives in RAM with limit checking, use
+ tmpfs.
+
+endmenu
+
+menu "Miscellaneous filesystems"
+
+config ADFS_FS
+ tristate "ADFS file system support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ The Acorn Disc Filing System is the standard file system of the
+ RiscOS operating system which runs on Acorn's ARM-based Risc PC
+ systems and the Acorn Archimedes range of machines. If you say Y
+ here, Linux will be able to read from ADFS partitions on hard drives
+ and from ADFS-formatted floppy discs. If you also want to be able to
+ write to those devices, say Y to "ADFS write support" below.
+
+ The ADFS partition should be the first partition (i.e.,
+ /dev/[hs]d?1) on each of your drives. Please read the file
+ <file:Documentation/filesystems/adfs.txt> for further details.
+
+ To compile this code as a module, choose M here: the module will be
+ called adfs.
+
+ If unsure, say N.
+
+config ADFS_FS_RW
+ bool "ADFS write support (DANGEROUS)"
+ depends on ADFS_FS
+ help
+ If you say Y here, you will be able to write to ADFS partitions on
+ hard drives and ADFS-formatted floppy disks. This is experimental
+ code, so if you're unsure, say N.
+
+config AFFS_FS
+ tristate "Amiga FFS file system support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ The Fast File System (FFS) is the common file system used on hard
+ disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y
+ if you want to be able to read and write files from and to an Amiga
+ FFS partition on your hard drive. Amiga floppies however cannot be
+ read with this driver due to an incompatibility of the floppy
+ controller used in an Amiga and the standard floppy controller in
+ PCs and workstations. Read <file:Documentation/filesystems/affs.txt>
+ and <file:fs/affs/Changes>.
+
+ With this driver you can also mount disk files used by Bernd
+ Schmidt's Un*X Amiga Emulator
+ (<http://www.freiburg.linux.de/~uae/>).
+ If you want to do this, you will also need to say Y or M to "Loop
+ device support", above.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called affs. If unsure, say N.
+
+config HFS_FS
+ tristate "Apple Macintosh file system support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ If you say Y here, you will be able to mount Macintosh-formatted
+ floppy disks and hard drive partitions with full read-write access.
+ Please read <file:fs/hfs/HFS.txt> to learn about the available mount
+ options.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called hfs.
+
+config BEFS_FS
+ tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ The BeOS File System (BeFS) is the native file system of Be, Inc's
+ BeOS. Notable features include support for arbitrary attributes
+ on files and directories, and database-like indices on selected
+ attributes. (Also note that this driver doesn't make those features
+ available at this time). It is a 64 bit filesystem, so it supports
+ extremely large volumes and files.
+
+ If you use this filesystem, you should also say Y to at least one
+ of the NLS (native language support) options below.
+
+ If you don't know what this is about, say N.
+
+ To compile this as a module, choose M here: the module will be
+ called befs.
+
+config BEFS_DEBUG
+ bool "Debug BeFS"
+ depends on BEFS_FS
+ help
+ If you say Y here, you can use the 'debug' mount option to enable
+ debugging output from the driver.
+
+config BFS_FS
+ tristate "BFS file system support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ Boot File System (BFS) is a file system used under SCO UnixWare to
+ allow the bootloader access to the kernel image and other important
+ files during the boot process. It is usually mounted under /stand
+ and corresponds to the slice marked as "STAND" in the UnixWare
+ partition. You should say Y if you want to read or write the files
+ on your /stand slice from within Linux. You then also need to say Y
+ to "UnixWare slices support", below. More information about the BFS
+ file system is contained in the file
+ <file:Documentation/filesystems/bfs.txt>.
+
+ If you don't know what this is about, say N.
+
+ To compile this as a module, choose M here: the module will be called
+ bfs. Note that the file system of your root partition (the one
+ containing the directory /) cannot be compiled as a module.
+
+config EFS_FS
+ tristate "EFS file system support (read only) (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ EFS is an older file system used for non-ISO9660 CD-ROMs and hard
+ disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer
+ uses the XFS file system for hard disk partitions however).
+
+ This implementation only offers read-only access. If you don't know
+ what all this is about, it's safe to say N. For more information
+ about EFS see its home page at <http://aeschi.ch.eu.org/efs/>.
+
+ To compile the EFS file system support as a module, choose M here: the
+ module will be called efs.
+
+config JFFS_FS
+ tristate "Journalling Flash File System (JFFS) support"
+ depends on MTD
+ help
+ JFFS is the Journalling Flash File System developed by Axis
+ Communications in Sweden, aimed at providing a crash/powerdown-safe
+ file system for disk-less embedded devices. Further information is
+ available at <http://developer.axis.com/software/jffs/>.
+
+config JFFS_FS_VERBOSE
+ int "JFFS debugging verbosity (0 = quiet, 3 = noisy)"
+ depends on JFFS_FS
+ default "0"
+ help
+ Determines the verbosity level of the JFFS debugging messages.
+
+config JFFS_PROC_FS
+ bool "JFFS stats available in /proc filesystem"
+ depends on JFFS_FS && PROC_FS
+ help
+ Enabling this option will cause statistics from mounted JFFS file
+ systems to be made available to the user in the /proc/fs/jffs/
+ directory.
+
+config JFFS2_FS
+ tristate "Journalling Flash File System v2 (JFFS2) support"
+ depends on MTD
+ select CRC32
+ select ZLIB_INFLATE
+ select ZLIB_DEFLATE
+ help
+ JFFS2 is the second generation of the Journalling Flash File System
+ for use on diskless embedded devices. It provides improved wear
+ levelling, compression and support for hard links. You cannot use
+ this on normal block devices, only on 'MTD' devices.
+
+ Further information on the design and implementation of JFFS2 is
+ available at <http://sources.redhat.com/jffs2/>.
+
+config JFFS2_FS_DEBUG
+ int "JFFS2 debugging verbosity (0 = quiet, 2 = noisy)"
+ depends on JFFS2_FS
+ default "0"
+ help
+ This controls the amount of debugging messages produced by the JFFS2
+ code. Set it to zero for use in production systems. For evaluation,
+ testing and debugging, it's advisable to set it to one. This will
+ enable a few assertions and will print debugging messages at the
+ KERN_DEBUG loglevel, where they won't normally be visible. Level 2
+ is unlikely to be useful - it enables extra debugging in certain
+ areas which at one point needed debugging, but when the bugs were
+ located and fixed, the detailed messages were relegated to level 2.
+
+ If reporting bugs, please try to have available a full dump of the
+ messages at debug level 1 while the misbehaviour was occurring.
+
+config JFFS2_FS_NAND
+ bool "JFFS2 support for NAND flash (EXPERIMENTAL)"
+ depends on JFFS2_FS && EXPERIMENTAL
+ default n
+ help
+ This enables the experimental support for NAND flash in JFFS2. NAND
+ is a newer type of flash chip design than the traditional NOR flash,
+ with higher density but a handful of characteristics which make it
+ more interesting for the file system to use. Support for NAND flash
+ is not yet complete and may corrupt data. For further information,
+ including a link to the mailing list where details of the remaining
+ work to be completed for NAND flash support can be found, see the
+ JFFS2 web site at <http://sources.redhat.com/jffs2>.
+
+ Say 'N' unless you have NAND flash and you are willing to test and
+ develop JFFS2 support for it.
+
+config CRAMFS
+ tristate "Compressed ROM file system support"
+ select ZLIB_INFLATE
+ help
+ Saying Y here includes support for CramFs (Compressed ROM File
+ System). CramFs is designed to be a simple, small, and compressed
+ file system for ROM based embedded systems. CramFs is read-only,
+ limited to 256MB file systems (with 16MB files), and doesn't support
+ 16/32 bits uid/gid, hard links and timestamps.
+
+ See <file:Documentation/filesystems/cramfs.txt> and
+ <file:fs/cramfs/README> for further information.
+
+ To compile this as a module, choose M here: the module will be called
+ cramfs. Note that the root file system (the one containing the
+ directory /) cannot be compiled as a module.
+
+ If unsure, say N.
+
+config VXFS_FS
+ tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
+ help
+ FreeVxFS is a file system driver that supports the VERITAS VxFS(TM)
+ file system format. VERITAS VxFS(TM) is the standard file system
+ of SCO UnixWare (and possibly others) and optionally available
+ for Sunsoft Solaris, HP-UX and many other operating systems.
+ Currently only readonly access is supported.
+
+ NOTE: the file system type as used by mount(1), mount(2) and
+ fstab(5) is 'vxfs' as it describes the file system format, not
+ the actual driver.
+
+ To compile this as a module, choose M here: the module will be
+ called freevxfs. If unsure, say N.
+
+config HPFS_FS
+ tristate "OS/2 HPFS file system support"
+ help
+ OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
+ is the file system used for organizing files on OS/2 hard disk
+ partitions. Say Y if you want to be able to read files from and
+ write files to an OS/2 HPFS partition on your hard drive. OS/2
+ floppies however are in regular MSDOS format, so you don't need this
+ option in order to be able to read them. Read
+ <file:Documentation/filesystems/hpfs.txt>.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called hpfs. If unsure, say N.
+
+config QNX4FS_FS
+ tristate "QNX4 file system support (read only)"
+ help
+ This is the file system used by the real-time operating systems
+ QNX 4 and QNX 6 (the latter is also called QNX RTP).
+ Further information is available at <http://www.qnx.com/>.
+ Say Y if you intend to mount QNX hard disks or floppies.
+ Unless you say Y to "QNX4FS read-write support" below, you will
+ only be able to read these file systems.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called qnx4.
+
+ If you don't know whether you need it, then you don't need it:
+ answer N.
+
+config QNX4FS_RW
+ bool "QNX4FS write support (DANGEROUS)"
+ depends on QNX4FS_FS && EXPERIMENTAL
+ help
+ Say Y if you want to test write support for QNX4 file systems.
+
+ It's currently broken, so for now:
+ answer N.
+
+config SYSV_FS
+ tristate "System V/Xenix/V7/Coherent file system support"
+ help
+ SCO, Xenix and Coherent are commercial Unix systems for Intel
+ machines, and Version 7 was used on the DEC PDP-11. Saying Y
+ here would allow you to read from their floppies and hard disk
+ partitions.
+
+ If you have floppies or hard disk partitions like that, it is likely
+ that they contain binaries from those other Unix systems; in order
+ to run these binaries, you will want to install linux-abi, which is
+ a set of kernel modules that lets you run SCO, Xenix, Wyse,
+ UnixWare, Dell Unix and System V programs under Linux. It is
+ available via FTP (user: ftp) from
+ <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>.
+ NOTE: that will work only for binaries from Intel-based systems;
+ PDP ones will have to wait until somebody ports Linux to -11 ;-)
+
+ If you only intend to mount files from some other Unix over the
+ network using NFS, you don't need the System V file system support
+ (but you need NFS file system support obviously).
+
+ Note that this option is generally not needed for floppies, since a
+ good portable way to transport files and directories between unixes
+ (and even other operating systems) is given by the tar program ("man
+ tar" or preferably "info tar"). Note also that this option has
+ nothing whatsoever to do with the option "System V IPC". Read about
+ the System V file system in
+ <file:Documentation/filesystems/sysv-fs.txt>.
+ Saying Y here will enlarge your kernel by about 27 KB.
+
+ To compile this as a module, choose M here: the module will be called
+ sysv.
+
+ If you haven't heard about all of this before, it's safe to say N.
+
+config UFS_FS
+ tristate "UFS file system support (read only)"
+ help
+ BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD,
+ OpenBSD and NeXTstep) use a file system called UFS. Some System V
+ Unixes can create and mount hard disk partitions and diskettes using
+ this file system as well. Saying Y here will allow you to read from
+ these partitions; if you also want to write to them, say Y to the
+ experimental "UFS file system write support", below. Please read the
+ file <file:Documentation/filesystems/ufs.txt> for more information.
+
+ If you only intend to mount files from some other Unix over the
+ network using NFS, you don't need the UFS file system support (but
+ you need NFS file system support obviously).
+
+ Note that this option is generally not needed for floppies, since a
+ good portable way to transport files and directories between unixes
+ (and even other operating systems) is given by the tar program ("man
+ tar" or preferably "info tar").
+
+ When accessing NeXTstep files, you may need to convert them from the
+ NeXT character set to the Latin1 character set; use the program
+ recode ("info recode") for this purpose.
+
+ To compile the UFS file system support as a module, choose M here: the
+ module will be called ufs.
+
+ If you haven't heard about all of this before, it's safe to say N.
+
+config UFS_FS_WRITE
+ bool "UFS file system write support (DANGEROUS)"
+ depends on UFS_FS && EXPERIMENTAL
+ help
+ Say Y here if you want to try writing to UFS partitions. This is
+ experimental, so you should back up your UFS partitions beforehand.
+
+endmenu
+
+menu "Network File Systems"
+ depends on NET
+
+config NFS_FS
+ tristate "NFS file system support"
+ depends on INET
+ select LOCKD
+ select SUNRPC
+ help
+ If you are connected to some other (usually local) Unix computer
+ (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing
+ on that computer (the NFS server) using the Network File Sharing
+ protocol, say Y. "Mounting files" means that the client can access
+ the files with usual UNIX commands as if they were sitting on the
+ client's hard disk. For this to work, the server must run the
+ programs nfsd and mountd (but does not need to have NFS file system
+ support enabled in its kernel). NFS is explained in the Network
+ Administrator's Guide, available from
+ <http://www.tldp.org/docs.html#guide>, on its man page: "man
+ nfs", and in the NFS-HOWTO.
+
+ A superior but less widely used alternative to NFS is provided by
+ the Coda file system; see "Coda file system support" below.
+
+ If you say Y here, you should have said Y to TCP/IP networking also.
+ This option would enlarge your kernel by about 27 KB.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called nfs.
+
+ If you are configuring a diskless machine which will mount its root
+ file system over NFS at boot time, say Y here and to "Kernel
+ level IP autoconfiguration" above and to "Root file system on NFS"
+ below. You cannot compile this driver as a module in this case.
+ There are two packages designed for booting diskless machines over
+ the net: netboot, available from
+ <http://ftp1.sourceforge.net/netboot/>, and Etherboot,
+ available from <http://ftp1.sourceforge.net/etherboot/>.
+
+ If you don't know what all this is about, say N.
+
+config NFS_V3
+ bool "Provide NFSv3 client support"
+ depends on NFS_FS
+ help
+ Say Y here if you want your NFS client to be able to speak the newer
+ version 3 of the NFS protocol.
+
+ If unsure, say N.
+
+config NFS_V4
+ bool "Provide NFSv4 client support (EXPERIMENTAL)"
+ depends on NFS_FS && EXPERIMENTAL
+ help
+ Say Y here if you want your NFS client to be able to speak the newer
+ version 4 of the NFS protocol. This feature is experimental, and
+ should only be used if you are interested in helping to test NFSv4.
+
+ If unsure, say N.
+
+config NFS_DIRECTIO
+ bool "Allow direct I/O on NFS files (EXPERIMENTAL)"
+ depends on NFS_FS && EXPERIMENTAL
+ help
+ This option enables applications to perform uncached I/O on files
+ in NFS file systems using the O_DIRECT open() flag. When O_DIRECT
+ is set for a file, its data is not cached in the system's page
+ cache. Data is moved to and from user-level application buffers
+ directly. Unlike local disk-based file systems, NFS O_DIRECT has
+ no alignment restrictions.
+
+ Unless your program is designed to use O_DIRECT properly, you are
+ much better off allowing the NFS client to manage data caching for
+ you. Misusing O_DIRECT can cause poor server performance or network
+ storms. This kernel build option defaults OFF to avoid exposing
+ system administrators unwittingly to a potentially hazardous
+ feature.
+
+ For more details on NFS O_DIRECT, see fs/nfs/direct.c.
+
+ If unsure, say N. This reduces the size of the NFS client, and
+ causes open() to return EINVAL if a file residing in NFS is
+ opened with the O_DIRECT flag.
+
+config NFSD
+ tristate "NFS server support"
+ depends on INET
+ select LOCKD
+ select SUNRPC
+ help
+ If you want your Linux box to act as an NFS *server*, so that other
+ computers on your local network which support NFS can access certain
+ directories on your box transparently, you have two options: you can
+ use the self-contained user space program nfsd, in which case you
+ should say N here, or you can say Y and use the kernel based NFS
+ server. The advantage of the kernel based solution is that it is
+ faster.
+
+ In either case, you will need support software; the respective
+ locations are given in the file <file:Documentation/Changes> in the
+ NFS section.
+
+ If you say Y here, you will get support for version 2 of the NFS
+ protocol (NFSv2). If you also want NFSv3, say Y to the next question
+ as well.
+
+ Please read the NFS-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>.
+
+ To compile the NFS server support as a module, choose M here: the
+ module will be called nfsd. If unsure, say N.
+
+config NFSD_V3
+ bool "Provide NFSv3 server support"
+ depends on NFSD
+ help
+ If you would like to include the NFSv3 server as well as the NFSv2
+ server, say Y here. If unsure, say Y.
+
+config NFSD_V4
+ bool "Provide NFSv4 server support (EXPERIMENTAL)"
+ depends on NFSD_V3 && EXPERIMENTAL
+ help
+ If you would like to include the NFSv4 server as well as the NFSv2
+ and NFSv3 servers, say Y here. This feature is experimental, and
+ should only be used if you are interested in helping to test NFSv4.
+ If unsure, say N.
+
+config NFSD_TCP
+ bool "Provide NFS server over TCP support (EXPERIMENTAL)"
+ depends on NFSD && EXPERIMENTAL
+ help
+ Enable NFS service over TCP connections. This is still officially
+ experimental, but seems to work well.
+
+config ROOT_NFS
+ bool "Root file system on NFS"
+ depends on NFS_FS=y && IP_PNP
+ help
+ If you want your Linux box to mount its whole root file system (the
+ one containing the directory /) from some other computer over the
+ net via NFS (presumably because your box doesn't have a hard disk),
+ say Y.
+ Read <file:Documentation/nfsroot.txt> for details. It is
+ likely that in this case, you also want to say Y to "Kernel level IP
+ autoconfiguration" so that your box can discover its network address
+ at boot time.
+
+ Most people say N here.
+
+config LOCKD
+ tristate
+
+config LOCKD_V4
+ bool
+ depends on NFSD_V3 || NFS_V3
+ default y
+
+config EXPORTFS
+ tristate
+ default NFSD
+
+config SUNRPC
+ tristate
+
+config SUNRPC_GSS
+ tristate "Provide RPCSEC_GSS authentication (EXPERIMENTAL)"
+ depends on SUNRPC && EXPERIMENTAL
+ default SUNRPC if NFS_V4=y
+ help
+ Provides cryptographic authentication for NFS rpc requests. To
+ make this useful, you must also select at least one rpcsec_gss
+ mechanism.
+ Note: You should always select this option if you wish to use
+ NFSv4.
+
+config RPCSEC_GSS_KRB5
+ tristate "Kerberos V mechanism for RPCSEC_GSS (EXPERIMENTAL)"
+ depends on SUNRPC_GSS && CRYPTO_DES && CRYPTO_MD5
+ default SUNRPC_GSS if NFS_V4=y
+ help
+ Provides a gss-api mechanism based on Kerberos V5 (this is
+ mandatory for RFC3010-compliant NFSv4 implementations).
+ Requires a userspace daemon;
+ see <http://www.citi.umich.edu/projects/nfsv4/>.
+
+ Note: If you select this option, please ensure that you also
+ enable the MD5 and DES crypto ciphers.
+
+config SMB_FS
+ tristate "SMB file system support (to mount Windows shares etc.)"
+ depends on INET
+ help
+ SMB (Server Message Block) is the protocol Windows for Workgroups
+ (WfW), Windows 95/98, Windows NT and OS/2 Lan Manager use to share
+ files and printers over local networks. Saying Y here allows you to
+ mount their file systems (often called "shares" in this context) and
+ access them just like any other Unix directory. Currently, this
+ works only if the Windows machines use TCP/IP as the underlying
+ transport protocol, and not NetBEUI. For details, read
+ <file:Documentation/filesystems/smbfs.txt> and the SMB-HOWTO,
+ available from <http://www.tldp.org/docs.html#howto>.
+
+ Note: if you just want your box to act as an SMB *server* and make
+ files and printing services available to Windows clients (which need
+ to have a TCP/IP stack), you don't need to say Y here; you can use
+ the program SAMBA (available from <ftp://ftp.samba.org/pub/samba/>)
+ for that.
+
+ General information about how to connect Linux, Windows machines and
+ Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
+
+ To compile the SMB support as a module, choose M here: the module will
+ be called smbfs. Most people say N, however.
+
+config SMB_NLS_DEFAULT
+ bool "Use a default NLS"
+ depends on SMB_FS
+ help
+ Enabling this will make smbfs use nls translations by default. You
+ need to specify the local charset (CONFIG_NLS_DEFAULT) in the nls
+ settings and you need to give the default nls for the SMB server as
+ CONFIG_SMB_NLS_REMOTE.
+
+ The nls settings can be changed at mount time, if your smbmount
+ supports that, using the codepage and iocharset parameters.
+
+ smbmount from samba 2.2.0 or later supports this.
+
+config SMB_NLS_REMOTE
+ string "Default Remote NLS Option"
+ depends on SMB_NLS_DEFAULT
+ default "cp437"
+ help
+ This setting allows you to specify a default value for which
+ codepage the server uses. If this field is left blank no
+ translations will be done by default. The local codepage/charset
+ defaults to CONFIG_NLS_DEFAULT.
+
+ The nls settings can be changed at mount time, if your smbmount
+ supports that, using the codepage and iocharset parameters.
+
+ smbmount from samba 2.2.0 or later supports this.
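+
+# Referring back to the "Allow direct I/O on NFS files" entry above, a
+# minimal sketch (illustrative only; the path is hypothetical) of opening
+# an NFS file with O_DIRECT. Per that help text, NFS O_DIRECT has no
+# alignment restrictions, unlike local disk file systems.
+#
+#	#define _GNU_SOURCE	/* for O_DIRECT */
+#	#include <fcntl.h>
+#	#include <unistd.h>
+#
+#	int main(void)
+#	{
+#		/* bypasses the page cache; open() returns EINVAL if the
+#		   kernel was built without CONFIG_NFS_DIRECTIO */
+#		int fd = open("/mnt/nfs/data", O_RDWR | O_DIRECT);
+#
+#		if (fd < 0)
+#			return 1;
+#		/* ... uncached reads and writes ... */
+#		close(fd);
+#		return 0;
+#	}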
+
+config CIFS
+ tristate "CIFS support (advanced network filesystem for Samba, Windows and other CIFS-compliant servers) (EXPERIMENTAL)"
+ depends on INET
+ help
+ This is the client VFS module for the Common Internet File System
+ (CIFS) protocol which is the successor to the Server Message Block
+ (SMB) protocol, the native file sharing mechanism for most early
+ PC operating systems. CIFS is fully supported by current network
+ file servers such as Windows 2000 (including Windows NT version 4
+ and Windows XP) as well as by Samba (which provides excellent CIFS
+ server support for Linux and many other operating systems). For
+ production systems the smbfs module may be used instead of this
+ cifs module since smbfs is currently more stable and provides
+ support for older servers. The intent of this module is to provide the
+ most advanced network file system function for CIFS compliant servers,
+ including support for dfs (hierarchical name space), secure per-user
+ session establishment, safe distributed caching (oplock), optional
+ packet signing, Unicode and other internationalization improvements, and
+ optional Winbind (nsswitch) integration. This module is in an early
+ development stage, so unless you are specifically interested in this
+ filesystem, just say N.
+
+config NCP_FS
+ tristate "NCP file system support (to mount NetWare volumes)"
+ depends on IPX!=n || INET
+ help
+ NCP (NetWare Core Protocol) is a protocol that runs over IPX and is
+ used by Novell NetWare clients to talk to file servers. It is to
+ IPX what NFS is to TCP/IP, if that helps. Saying Y here allows you
+ to mount NetWare file server volumes and to access them just like
+ any other Unix directory. For details, please read the file
+ <file:Documentation/filesystems/ncpfs.txt> in the kernel source and
+ the IPX-HOWTO from <http://www.tldp.org/docs.html#howto>.
+
+ You do not have to say Y here if you want your Linux box to act as a
+ file *server* for Novell NetWare clients.
+
+ General information about how to connect Linux, Windows machines and
+ Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>.
+
+ To compile this as a module, choose M here: the module will be called
+ ncpfs. Say N unless you are connected to a Novell network.
+
+source "fs/ncpfs/Kconfig"
+
+config CODA_FS
+ tristate "Coda file system support (advanced network fs)"
+ depends on INET
+ help
+ Coda is an advanced network file system, similar to NFS in that it
+ enables you to mount file systems of a remote server and access them
+ with regular Unix commands as if they were sitting on your hard
+ disk. Coda has several advantages over NFS: support for
+ disconnected operation (e.g. for laptops), read/write server
+ replication, security model for authentication and encryption,
+ persistent client caches and write back caching.
+
+ If you say Y here, your Linux box will be able to act as a Coda
+ *client*. You will need user level code as well, both for the
+ client and server. Servers are currently user level, i.e. they need
+ no kernel support. Please read
+ <file:Documentation/filesystems/coda.txt> and check out the Coda
+ home page <http://www.coda.cs.cmu.edu/>.
+
+ To compile the coda client support as a module, choose M here: the
+ module will be called coda.
+
+config CODA_FS_OLD_API
+ bool "Use 96-bit Coda file identifiers"
+ depends on CODA_FS
+ help
+ A new kernel-userspace API had to be introduced for Coda v6.0
+ to support larger 128-bit file identifiers as needed by the
+ new realms implementation.
+
+ However, this new API is not backward compatible with older
+ clients. If you really need to run the old Coda userspace
+ cache manager then say Y.
+
+ For most cases you probably want to say N.
+
+config INTERMEZZO_FS
+ tristate "InterMezzo file system support (replicating fs) (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
+ help
+ InterMezzo is a networked file system with disconnected operation
+ and kernel level write back caching. It is most often used for
+ replicating potentially large trees or keeping laptop/desktop copies
+ in sync.
+
+ If you say Y or M your kernel or module will provide InterMezzo
+ support. You will also need a file server daemon, which you can get
+ from <http://www.inter-mezzo.org/>.
+
+config AFS_FS
+# for fs/nls/Config.in
+ tristate "Andrew File System support (AFS) (Experimental)"
+ depends on INET && EXPERIMENTAL
+ select RXRPC
+ help
+ If you say Y here, you will get an experimental Andrew File System
+ driver. It currently only supports unsecured read-only AFS access.
+
+ See <file:Documentation/filesystems/afs.txt> for more information.
+
+ If unsure, say N.
+
+config RXRPC
+ tristate
+
+endmenu
+
+menu "Partition Types"
+
+source "fs/partitions/Kconfig"
+
+endmenu
+
+source "fs/nls/Kconfig"
+
+endmenu
+
diff -Nru a/fs/Makefile b/fs/Makefile
--- a/fs/Makefile Fri Oct 31 14:10:54 2003
+++ b/fs/Makefile Fri Oct 31 14:10:54 2003
@@ -85,9 +85,12 @@
 obj-$(CONFIG_AUTOFS4_FS) += autofs4/
 obj-$(CONFIG_ADFS_FS) += adfs/
 obj-$(CONFIG_REISERFS_FS) += reiserfs/
+obj-$(CONFIG_REISER4_FS) += reiser4/
 obj-$(CONFIG_UDF_FS) += udf/
 obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
 obj-$(CONFIG_JFS_FS) += jfs/
 obj-$(CONFIG_XFS_FS) += xfs/
 obj-$(CONFIG_AFS_FS) += afs/
 obj-$(CONFIG_BEFS_FS) += befs/
+obj-$(CONFIG_HOSTFS) += hostfs/
+obj-$(CONFIG_HPPFS) += hppfs/
diff -Nru a/fs/Makefile~reiser4-fs-Makefile.diff b/fs/Makefile~reiser4-fs-Makefile.diff
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/fs/Makefile~reiser4-fs-Makefile.diff Fri Oct 31 14:10:54 2003
@@ -0,0 +1,95 @@
+#
+# Makefile for the Linux filesystems.
+#
+# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
+# Rewritten to use lists instead of if-statements.
+# + +obj-y := open.o read_write.o file_table.o buffer.o \ + bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \ + namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ + dcache.o inode.o attr.o bad_inode.o file.o dnotify.o \ + filesystems.o namespace.o seq_file.o xattr.o libfs.o \ + fs-writeback.o mpage.o direct-io.o aio.o + +obj-$(CONFIG_EPOLL) += eventpoll.o +obj-$(CONFIG_COMPAT) += compat.o + +nfsd-$(CONFIG_NFSD) := nfsctl.o +obj-y += $(nfsd-y) $(nfsd-m) + +obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o +obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o +obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o + +# binfmt_script is always there +obj-y += binfmt_script.o + +obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o +obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o +obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o + +obj-$(CONFIG_FS_MBCACHE) += mbcache.o +obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o + +obj-$(CONFIG_QUOTA) += dquot.o +obj-$(CONFIG_QFMT_V1) += quota_v1.o +obj-$(CONFIG_QFMT_V2) += quota_v2.o +obj-$(CONFIG_QUOTACTL) += quota.o + +obj-$(CONFIG_PROC_FS) += proc/ +obj-y += partitions/ +obj-y += sysfs/ +obj-y += devpts/ + +obj-$(CONFIG_PROFILING) += dcookies.o + +# Do not add any filesystems before this line +obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 +obj-$(CONFIG_JBD) += jbd/ +obj-$(CONFIG_EXT2_FS) += ext2/ +obj-$(CONFIG_CRAMFS) += cramfs/ +obj-$(CONFIG_RAMFS) += ramfs/ +obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ +obj-$(CONFIG_CODA_FS) += coda/ +obj-$(CONFIG_INTERMEZZO_FS) += intermezzo/ +obj-$(CONFIG_MINIX_FS) += minix/ +obj-$(CONFIG_FAT_FS) += fat/ +obj-$(CONFIG_UMSDOS_FS) += umsdos/ +obj-$(CONFIG_MSDOS_FS) += msdos/ +obj-$(CONFIG_VFAT_FS) += vfat/ +obj-$(CONFIG_BFS_FS) += bfs/ +obj-$(CONFIG_ISO9660_FS) += isofs/ +obj-$(CONFIG_DEVFS_FS) += devfs/ +obj-$(CONFIG_HFS_FS) += hfs/ +obj-$(CONFIG_VXFS_FS) += freevxfs/ +obj-$(CONFIG_NFS_FS) += nfs/ +obj-$(CONFIG_EXPORTFS) += exportfs/ +obj-$(CONFIG_NFSD) += nfsd/ +obj-$(CONFIG_LOCKD) += lockd/ +obj-$(CONFIG_NLS) += nls/ +obj-$(CONFIG_SYSV_FS) += sysv/ +obj-$(CONFIG_SMB_FS) += smbfs/ +obj-$(CONFIG_CIFS) += cifs/ +obj-$(CONFIG_NCP_FS) += ncpfs/ +obj-$(CONFIG_HPFS_FS) += hpfs/ +obj-$(CONFIG_NTFS_FS) += ntfs/ +obj-$(CONFIG_UFS_FS) += ufs/ +obj-$(CONFIG_EFS_FS) += efs/ +obj-$(CONFIG_JFFS_FS) += jffs/ +obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_AFFS_FS) += affs/ +obj-$(CONFIG_ROMFS_FS) += romfs/ +obj-$(CONFIG_QNX4FS_FS) += qnx4/ +obj-$(CONFIG_AUTOFS_FS) += autofs/ +obj-$(CONFIG_AUTOFS4_FS) += autofs4/ +obj-$(CONFIG_ADFS_FS) += adfs/ +obj-$(CONFIG_REISERFS_FS) += reiserfs/ +obj-$(CONFIG_UDF_FS) += udf/ +obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ +obj-$(CONFIG_JFS_FS) += jfs/ +obj-$(CONFIG_XFS_FS) += xfs/ +obj-$(CONFIG_AFS_FS) += afs/ +obj-$(CONFIG_BEFS_FS) += befs/ +obj-$(CONFIG_HOSTFS) += hostfs/ +obj-$(CONFIG_HPPFS) += hppfs/ diff -Nru a/fs/Makefile~uml-summa.diff b/fs/Makefile~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/Makefile~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,93 @@ +# +# Makefile for the Linux filesystems. +# +# 14 Sep 2000, Christoph Hellwig <hch@infradead.org> +# Rewritten to use lists instead of if-statements. 
+# + +obj-y := open.o read_write.o file_table.o buffer.o \ + bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \ + namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ + dcache.o inode.o attr.o bad_inode.o file.o dnotify.o \ + filesystems.o namespace.o seq_file.o xattr.o libfs.o \ + fs-writeback.o mpage.o direct-io.o aio.o + +obj-$(CONFIG_EPOLL) += eventpoll.o +obj-$(CONFIG_COMPAT) += compat.o + +nfsd-$(CONFIG_NFSD) := nfsctl.o +obj-y += $(nfsd-y) $(nfsd-m) + +obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o +obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o +obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o + +# binfmt_script is always there +obj-y += binfmt_script.o + +obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o +obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o +obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o + +obj-$(CONFIG_FS_MBCACHE) += mbcache.o +obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o + +obj-$(CONFIG_QUOTA) += dquot.o +obj-$(CONFIG_QFMT_V1) += quota_v1.o +obj-$(CONFIG_QFMT_V2) += quota_v2.o +obj-$(CONFIG_QUOTACTL) += quota.o + +obj-$(CONFIG_PROC_FS) += proc/ +obj-y += partitions/ +obj-y += sysfs/ +obj-y += devpts/ + +obj-$(CONFIG_PROFILING) += dcookies.o + +# Do not add any filesystems before this line +obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 +obj-$(CONFIG_JBD) += jbd/ +obj-$(CONFIG_EXT2_FS) += ext2/ +obj-$(CONFIG_CRAMFS) += cramfs/ +obj-$(CONFIG_RAMFS) += ramfs/ +obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ +obj-$(CONFIG_CODA_FS) += coda/ +obj-$(CONFIG_INTERMEZZO_FS) += intermezzo/ +obj-$(CONFIG_MINIX_FS) += minix/ +obj-$(CONFIG_FAT_FS) += fat/ +obj-$(CONFIG_UMSDOS_FS) += umsdos/ +obj-$(CONFIG_MSDOS_FS) += msdos/ +obj-$(CONFIG_VFAT_FS) += vfat/ +obj-$(CONFIG_BFS_FS) += bfs/ +obj-$(CONFIG_ISO9660_FS) += isofs/ +obj-$(CONFIG_DEVFS_FS) += devfs/ +obj-$(CONFIG_HFS_FS) += hfs/ +obj-$(CONFIG_VXFS_FS) += freevxfs/ +obj-$(CONFIG_NFS_FS) += nfs/ +obj-$(CONFIG_EXPORTFS) += exportfs/ +obj-$(CONFIG_NFSD) += nfsd/ +obj-$(CONFIG_LOCKD) += lockd/ +obj-$(CONFIG_NLS) += nls/ +obj-$(CONFIG_SYSV_FS) += sysv/ +obj-$(CONFIG_SMB_FS) += smbfs/ +obj-$(CONFIG_CIFS) += cifs/ +obj-$(CONFIG_NCP_FS) += ncpfs/ +obj-$(CONFIG_HPFS_FS) += hpfs/ +obj-$(CONFIG_NTFS_FS) += ntfs/ +obj-$(CONFIG_UFS_FS) += ufs/ +obj-$(CONFIG_EFS_FS) += efs/ +obj-$(CONFIG_JFFS_FS) += jffs/ +obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_AFFS_FS) += affs/ +obj-$(CONFIG_ROMFS_FS) += romfs/ +obj-$(CONFIG_QNX4FS_FS) += qnx4/ +obj-$(CONFIG_AUTOFS_FS) += autofs/ +obj-$(CONFIG_AUTOFS4_FS) += autofs4/ +obj-$(CONFIG_ADFS_FS) += adfs/ +obj-$(CONFIG_REISERFS_FS) += reiserfs/ +obj-$(CONFIG_UDF_FS) += udf/ +obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ +obj-$(CONFIG_JFS_FS) += jfs/ +obj-$(CONFIG_XFS_FS) += xfs/ +obj-$(CONFIG_AFS_FS) += afs/ +obj-$(CONFIG_BEFS_FS) += befs/ diff -Nru a/fs/binfmt_misc.c b/fs/binfmt_misc.c --- a/fs/binfmt_misc.c Fri Oct 31 14:10:53 2003 +++ b/fs/binfmt_misc.c Fri Oct 31 14:10:53 2003 @@ -529,8 +529,8 @@ inode->u.generic_ip = e; inode->i_fop = &bm_entry_operations; - write_lock(&entries_lock); d_instantiate(dentry, inode); + write_lock(&entries_lock); list_add(&e->list, &entries); write_unlock(&entries_lock); diff -Nru a/fs/buffer.c b/fs/buffer.c --- a/fs/buffer.c Fri Oct 31 14:10:54 2003 +++ b/fs/buffer.c Fri Oct 31 14:10:54 2003 @@ -239,6 +239,7 @@ return sync_blockdev(sb->s_bdev); } +EXPORT_SYMBOL(fsync_super); /* * Write out and wait upon all dirty data associated with this diff -Nru a/fs/buffer.c~fsync_super.diff b/fs/buffer.c~fsync_super.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ 
b/fs/buffer.c~fsync_super.diff Fri Oct 31 14:10:54 2003
@@ -0,0 +1,3065 @@
+/*
+ *  linux/fs/buffer.c
+ *
+ *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
+ */
+
+/*
+ * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
+ *
+ * Removed a lot of unnecessary code and simplified things now that
+ * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
+ *
+ * Speed up hash, lru, and free list operations.  Use gfp() for allocating
+ * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
+ *
+ * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
+ *
+ * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/blkdev.h>
+#include <linux/file.h>
+#include <linux/quotaops.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/hash.h>
+#include <linux/suspend.h>
+#include <linux/buffer_head.h>
+#include <linux/bio.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <asm/bitops.h>
+
+static void invalidate_bh_lrus(void);
+
+#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
+
+/*
+ * Hashed waitqueue_head's for wait_on_buffer()
+ */
+#define BH_WAIT_TABLE_ORDER	7
+static struct bh_wait_queue_head {
+	wait_queue_head_t wqh;
+} ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER];
+
+/*
+ * Debug/devel support stuff
+ */
+
+void __buffer_error(char *file, int line)
+{
+	static int enough;
+
+	if (enough > 10)
+		return;
+	enough++;
+	printk("buffer layer error at %s:%d\n", file, line);
+#ifndef CONFIG_KALLSYMS
+	printk("Pass this trace through ksymoops for reporting\n");
+#endif
+	dump_stack();
+}
+EXPORT_SYMBOL(__buffer_error);
+
+inline void
+init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+{
+	bh->b_end_io = handler;
+	bh->b_private = private;
+}
+
+/*
+ * Return the address of the waitqueue_head to be used for this
+ * buffer_head
+ */
+wait_queue_head_t *bh_waitq_head(struct buffer_head *bh)
+{
+	return &bh_wait_queue_heads[hash_ptr(bh, BH_WAIT_TABLE_ORDER)].wqh;
+}
+EXPORT_SYMBOL(bh_waitq_head);
+
+void wake_up_buffer(struct buffer_head *bh)
+{
+	wait_queue_head_t *wq = bh_waitq_head(bh);
+
+	smp_mb();
+	if (waitqueue_active(wq))
+		wake_up_all(wq);
+}
+EXPORT_SYMBOL(wake_up_buffer);
+
+void unlock_buffer(struct buffer_head *bh)
+{
+	/*
+	 * unlock_buffer against a zero-count bh is a bug, if the page
+	 * is not locked.  Because then nothing protects the buffer's
+	 * waitqueue, which is used here. (Well.  Other locked buffers
+	 * against the page will pin it.  But complain anyway).
+	 */
+	if (atomic_read(&bh->b_count) == 0 &&
+			!PageLocked(bh->b_page) &&
+			!PageWriteback(bh->b_page))
+		buffer_error();
+
+	clear_buffer_locked(bh);
+	smp_mb__after_clear_bit();
+	wake_up_buffer(bh);
+}
+
+/*
+ * Block until a buffer comes unlocked.  This doesn't stop it
+ * from becoming locked again - you have to lock it yourself
+ * if you want to preserve its state.
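+ *
+ * A typical caller pattern, as used by the fsync paths later in this
+ * file (sketch only):
+ *
+ *	get_bh(bh);
+ *	wait_on_buffer(bh);
+ *	if (!buffer_uptodate(bh))
+ *		err = -EIO;
+ *	brelse(bh);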
+ */ +void __wait_on_buffer(struct buffer_head * bh) +{ + wait_queue_head_t *wqh = bh_waitq_head(bh); + DEFINE_WAIT(wait); + + if (atomic_read(&bh->b_count) == 0 && + (!bh->b_page || !PageLocked(bh->b_page))) + buffer_error(); + + do { + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + if (buffer_locked(bh)) { + blk_run_queues(); + io_schedule(); + } + } while (buffer_locked(bh)); + finish_wait(wqh, &wait); +} + +static void +__set_page_buffers(struct page *page, struct buffer_head *head) +{ + if (page_has_buffers(page)) + buffer_error(); + page_cache_get(page); + SetPagePrivate(page); + page->private = (unsigned long)head; +} + +static void +__clear_page_buffers(struct page *page) +{ + ClearPagePrivate(page); + page->private = 0; + page_cache_release(page); +} + +static void buffer_io_error(struct buffer_head *bh) +{ + char b[BDEVNAME_SIZE]; + + printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n", + bdevname(bh->b_bdev, b), + (unsigned long long)bh->b_blocknr); +} + +/* + * Default synchronous end-of-IO handler.. Just mark it up-to-date and + * unlock the buffer. This is what ll_rw_block uses too. + */ +void end_buffer_read_sync(struct buffer_head *bh, int uptodate) +{ + if (uptodate) { + set_buffer_uptodate(bh); + } else { + /* This happens, due to failed READA attempts. */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +void end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + buffer_io_error(bh); + printk(KERN_WARNING "lost page write due to I/O error on %s\n", + bdevname(bh->b_bdev, b)); + set_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +/* + * Write out and wait upon all the dirty data associated with a block + * device via its mapping. Does not take the superblock lock. + */ +int sync_blockdev(struct block_device *bdev) +{ + int ret = 0; + + if (bdev) { + int err; + + ret = filemap_fdatawrite(bdev->bd_inode->i_mapping); + err = filemap_fdatawait(bdev->bd_inode->i_mapping); + if (!ret) + ret = err; + } + return ret; +} +EXPORT_SYMBOL(sync_blockdev); + +/* + * Write out and wait upon all dirty data associated with this + * superblock. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_super(struct super_block *sb) +{ + sync_inodes_sb(sb, 0); + DQUOT_SYNC(sb); + lock_super(sb); + if (sb->s_dirt && sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + sync_blockdev(sb->s_bdev); + sync_inodes_sb(sb, 1); + + return sync_blockdev(sb->s_bdev); +} + +/* + * Write out and wait upon all dirty data associated with this + * device. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_bdev(struct block_device *bdev) +{ + struct super_block *sb = get_super(bdev); + if (sb) { + int res = fsync_super(sb); + drop_super(sb); + return res; + } + return sync_blockdev(bdev); +} + +/* + * sync everything. Start out by waking pdflush, because that writes back + * all queues in parallel. 
+ */ +static void do_sync(unsigned long wait) +{ + wakeup_bdflush(0); + sync_inodes(0); /* All mappings, inodes and their blockdevs */ + DQUOT_SYNC(NULL); + sync_supers(); /* Write the superblocks */ + sync_filesystems(0); /* Start syncing the filesystems */ + sync_filesystems(wait); /* Waitingly sync the filesystems */ + sync_inodes(wait); /* Mappings, inodes and blockdevs, again. */ + if (!wait) + printk("Emergency Sync complete\n"); +} + +asmlinkage long sys_sync(void) +{ + do_sync(1); + return 0; +} + +void emergency_sync(void) +{ + pdflush_operation(do_sync, 0); +} + +/* + * Generic function to fsync a file. + * + * filp may be NULL if called via the msync of a vma. + */ + +int file_fsync(struct file *filp, struct dentry *dentry, int datasync) +{ + struct inode * inode = dentry->d_inode; + struct super_block * sb; + int ret; + + /* sync the inode to buffers */ + write_inode_now(inode, 0); + + /* sync the superblock to buffers */ + sb = inode->i_sb; + lock_super(sb); + if (sb->s_op->write_super) + sb->s_op->write_super(sb); + unlock_super(sb); + + /* .. finally sync the buffers to disk */ + ret = sync_blockdev(sb->s_bdev); + return ret; +} + +asmlinkage long sys_fsync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int ret, err; + + ret = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + ret = -EINVAL; + if (!file->f_op || !file->f_op->fsync) { + /* Why? We can still call filemap_fdatawrite */ + goto out_putf; + } + + /* We need to protect against concurrent writers.. */ + down(&inode->i_sem); + current->flags |= PF_SYNCWRITE; + ret = filemap_fdatawrite(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 0); + if (!ret) + ret = err; + err = filemap_fdatawait(inode->i_mapping); + if (!ret) + ret = err; + current->flags &= ~PF_SYNCWRITE; + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return ret; +} + +asmlinkage long sys_fdatasync(unsigned int fd) +{ + struct file * file; + struct dentry * dentry; + struct inode * inode; + int ret, err; + + ret = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + ret = -EINVAL; + if (!file->f_op || !file->f_op->fsync) + goto out_putf; + + down(&inode->i_sem); + current->flags |= PF_SYNCWRITE; + ret = filemap_fdatawrite(inode->i_mapping); + err = file->f_op->fsync(file, dentry, 1); + if (!ret) + ret = err; + err = filemap_fdatawait(inode->i_mapping); + if (!ret) + ret = err; + current->flags &= ~PF_SYNCWRITE; + up(&inode->i_sem); + +out_putf: + fput(file); +out: + return ret; +} + +/* + * Various filesystems appear to want __find_get_block to be non-blocking. + * But it's the page lock which protects the buffers. To get around this, + * we get exclusion from try_to_free_buffers with the blockdev mapping's + * private_lock. + * + * Hack idea: for the blockdev mapping, i_bufferlist_lock contention + * may be quite high. This code could TryLock the page, and if that + * succeeds, there is no need to take private_lock. (But if + * private_lock is contended then so is mapping->page_lock). 
+ */
+static struct buffer_head *
+__find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
+{
+	struct inode *bd_inode = bdev->bd_inode;
+	struct address_space *bd_mapping = bd_inode->i_mapping;
+	struct buffer_head *ret = NULL;
+	unsigned long index;
+	struct buffer_head *bh;
+	struct buffer_head *head;
+	struct page *page;
+
+	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
+	page = find_get_page(bd_mapping, index);
+	if (!page)
+		goto out;
+
+	spin_lock(&bd_mapping->private_lock);
+	if (!page_has_buffers(page))
+		goto out_unlock;
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (bh->b_blocknr == block) {
+			ret = bh;
+			get_bh(bh);
+			goto out_unlock;
+		}
+		bh = bh->b_this_page;
+	} while (bh != head);
+	buffer_error();
+	printk("block=%llu, b_blocknr=%llu\n",
+		(unsigned long long)block, (unsigned long long)bh->b_blocknr);
+	printk("b_state=0x%08lx, b_size=%u\n", bh->b_state, bh->b_size);
+out_unlock:
+	spin_unlock(&bd_mapping->private_lock);
+	page_cache_release(page);
+out:
+	return ret;
+}
+
+/* If invalidate_buffers() will trash dirty buffers, it means some kind
+   of fs corruption is going on. Trashing dirty data always implies losing
+   information that was supposed to be just stored on the physical layer
+   by the user.
+
+   Thus invalidate_buffers in general usage is not allowed to trash
+   dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
+   be preserved.  These buffers are simply skipped.
+
+   We also skip buffers which are still in use.  For example this can
+   happen if a userspace program is reading the block device.
+
+   NOTE: In the case where the user removed a removable-media-disk even if
+   there's still dirty data not synced on disk (due to a bug in the device
+   driver or due to an error of the user), by not destroying the dirty
+   buffers we could generate corruption also on the next media inserted,
+   thus a parameter is necessary to handle this case in the most safe way
+   possible (trying to not corrupt also the new disk inserted with the data
+   belonging to the old now corrupted disk). Also for the ramdisk the
+   natural thing to do in order to release the ramdisk memory is to destroy
+   dirty buffers.
+
+   These are two special cases. Normal usage implies the device driver
+   to issue a sync on the device (without waiting I/O completion) and
+   then an invalidate_buffers call that doesn't trash dirty buffers.
+
+   For handling cache coherency with the blkdev pagecache the 'update' case
+   has been introduced. It is needed to re-read from disk any pinned
+   buffer. NOTE: re-reading from disk is destructive so we can do it only
+   when we assume nobody is changing the buffercache under our I/O and when
+   we think the disk contains more recent information than the buffercache.
+   The update == 1 pass marks the buffers we need to update, the update == 2
+   pass does the actual I/O. */
+void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
+{
+	invalidate_bh_lrus();
+	/*
+	 * FIXME: what about destroy_dirty_buffers?
+	 * We really want to use invalidate_inode_pages2() for
+	 * that, but not until that's cleaned up.
+	 */
+	invalidate_inode_pages(bdev->bd_inode->i_mapping);
+}
+
+/*
+ * Kick pdflush then try to free up some ZONE_NORMAL memory.
+ */
+static void free_more_memory(void)
+{
+	struct zone *zone;
+	pg_data_t *pgdat;
+
+	wakeup_bdflush(1024);
+	blk_run_queues();
+	yield();
+
+	for_each_pgdat(pgdat) {
+		zone = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones[0];
+		if (zone)
+			try_to_free_pages(zone, GFP_NOFS, 0);
+	}
+}
+
+/*
+ * I/O completion handler for block_read_full_page() - pages
+ * which come unlocked at the end of I/O.
+ */
+static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
+{
+	static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
+	unsigned long flags;
+	struct buffer_head *tmp;
+	struct page *page;
+	int page_uptodate = 1;
+
+	BUG_ON(!buffer_async_read(bh));
+
+	page = bh->b_page;
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+	} else {
+		clear_buffer_uptodate(bh);
+		buffer_io_error(bh);
+		SetPageError(page);
+	}
+
+	/*
+	 * Be _very_ careful from here on. Bad things can happen if
+	 * two buffer heads end IO at almost the same time and both
+	 * decide that the page is now completely done.
+	 */
+	spin_lock_irqsave(&page_uptodate_lock, flags);
+	clear_buffer_async_read(bh);
+	unlock_buffer(bh);
+	tmp = bh;
+	do {
+		if (!buffer_uptodate(tmp))
+			page_uptodate = 0;
+		if (buffer_async_read(tmp)) {
+			BUG_ON(!buffer_locked(tmp));
+			goto still_busy;
+		}
+		tmp = tmp->b_this_page;
+	} while (tmp != bh);
+	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+
+	/*
+	 * If none of the buffers had errors and they are all
+	 * uptodate then we can set the page uptodate.
+	 */
+	if (page_uptodate && !PageError(page))
+		SetPageUptodate(page);
+	unlock_page(page);
+	return;
+
+still_busy:
+	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	return;
+}
+
+/*
+ * Completion handler for block_write_full_page() - pages which are unlocked
+ * during I/O, and which have PageWriteback cleared upon I/O completion.
+ */
+void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+{
+	char b[BDEVNAME_SIZE];
+	static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
+	unsigned long flags;
+	struct buffer_head *tmp;
+	struct page *page;
+
+	BUG_ON(!buffer_async_write(bh));
+
+	page = bh->b_page;
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+	} else {
+		buffer_io_error(bh);
+		printk(KERN_WARNING "lost page write due to I/O error on %s\n",
+			bdevname(bh->b_bdev, b));
+		set_bit(AS_EIO, &page->mapping->flags);
+		clear_buffer_uptodate(bh);
+		SetPageError(page);
+	}
+
+	spin_lock_irqsave(&page_uptodate_lock, flags);
+	clear_buffer_async_write(bh);
+	unlock_buffer(bh);
+	tmp = bh->b_this_page;
+	while (tmp != bh) {
+		if (buffer_async_write(tmp)) {
+			BUG_ON(!buffer_locked(tmp));
+			goto still_busy;
+		}
+		tmp = tmp->b_this_page;
+	}
+	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	end_page_writeback(page);
+	return;
+
+still_busy:
+	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	return;
+}
+
+/*
+ * If a page's buffers are under async read-in (end_buffer_async_read
+ * completion) then there is a possibility that another thread of
+ * control could lock one of the buffers after it has completed
+ * but while some of the other buffers have not completed.  This
+ * locked buffer would confuse end_buffer_async_read() into not unlocking
+ * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
+ * that this buffer is not under async I/O.
+ *
+ * The page comes unlocked when it has no locked buffer_async buffers
+ * left.
+ *
+ * PageLocked prevents anyone from starting new async I/O reads against
+ * any of the buffers.
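+ *
+ * Illustrative read-side submission pattern for one buffer (sketch
+ * only; compare block_read_full_page()):
+ *
+ *	lock_buffer(bh);
+ *	mark_buffer_async_read(bh);
+ *	submit_bh(READ, bh);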
+ * + * PageWriteback is used to prevent simultaneous writeout of the same + * page. + * + * PageLocked prevents anyone from starting writeback of a page which is + * under read I/O (PageWriteback is only ever set against a locked page). + */ +void mark_buffer_async_read(struct buffer_head *bh) +{ + bh->b_end_io = end_buffer_async_read; + set_buffer_async_read(bh); +} +EXPORT_SYMBOL(mark_buffer_async_read); + +void mark_buffer_async_write(struct buffer_head *bh) +{ + bh->b_end_io = end_buffer_async_write; + set_buffer_async_write(bh); +} +EXPORT_SYMBOL(mark_buffer_async_write); + + +/* + * fs/buffer.c contains helper functions for buffer-backed address space's + * fsync functions. A common requirement for buffer-based filesystems is + * that certain data from the backing blockdev needs to be written out for + * a successful fsync(). For example, ext2 indirect blocks need to be + * written back and waited upon before fsync() returns. + * + * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * inode_has_buffers() and invalidate_inode_buffers() are provided for the + * management of a list of dependent buffers at ->i_mapping->private_list. + * + * Locking is a little subtle: try_to_free_buffers() will remove buffers + * from their controlling inode's queue when they are being freed. But + * try_to_free_buffers() will be operating against the *blockdev* mapping + * at the time, not against the S_ISREG file which depends on those buffers. + * So the locking for private_list is via the private_lock in the address_space + * which backs the buffers. Which is different from the address_space + * against which the buffers are listed. So for a particular address_space, + * mapping->private_lock does *not* protect mapping->private_list! In fact, + * mapping->private_list will always be protected by the backing blockdev's + * ->private_lock. + * + * Which introduces a requirement: all buffers on an address_space's + * ->private_list must be from the same address_space: the blockdev's. + * + * address_spaces which do not place buffers at ->private_list via these + * utility functions are free to use private_lock and private_list for + * whatever they want. The only requirement is that list_empty(private_list) + * be true at clear_inode() time. + * + * FIXME: clear_inode should not call invalidate_inode_buffers(). The + * filesystems should do that. invalidate_inode_buffers() should just go + * BUG_ON(!list_empty). + * + * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should + * take an address_space, not an inode. And it should be called + * mark_buffer_dirty_fsync() to clearly define why those buffers are being + * queued up. + * + * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the + * list if it is already on a list. Because if the buffer is on a list, + * it *must* already be on the right one. If not, the filesystem is being + * silly. This will save a ton of locking. But first we have to ensure + * that buffers are taken *off* the old inode's list when they are freed + * (presumably in truncate). That requires careful auditing of all + * filesystems (do it inside bforget()). It could also be done by bringing + * b_inode back. 
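+ *
+ * Illustrative usage, mirroring the ext2 indirect-block example above
+ * (hypothetical filesystem code): dirty the dependent buffer against the
+ * data inode when it is modified, then write-and-wait at fsync time:
+ *
+ *	mark_buffer_dirty_inode(bh, inode);
+ *	...
+ *	err = sync_mapping_buffers(inode->i_mapping);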
+ */ + +void buffer_insert_list(spinlock_t *lock, + struct buffer_head *bh, struct list_head *list) +{ + spin_lock(lock); + list_move_tail(&bh->b_assoc_buffers, list); + spin_unlock(lock); +} + +/* + * The buffer's backing address_space's private_lock must be held + */ +static inline void __remove_assoc_queue(struct buffer_head *bh) +{ + list_del_init(&bh->b_assoc_buffers); +} + +int inode_has_buffers(struct inode *inode) +{ + return !list_empty(&inode->i_data.private_list); +} + +/* + * osync is designed to support O_SYNC io. It waits synchronously for + * all already-submitted IO to complete, but does not queue any new + * writes to the disk. + * + * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as + * you dirty the buffers, and then use osync_inode_buffers to wait for + * completion. Any other dirty buffers which are not yet queued for + * write will not be flushed to disk by the osync. + */ +static int osync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head *p; + int err = 0; + + spin_lock(lock); +repeat: + list_for_each_prev(p, list) { + bh = BH_ENTRY(p); + if (buffer_locked(bh)) { + get_bh(bh); + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + goto repeat; + } + } + spin_unlock(lock); + return err; +} + +/** + * sync_mapping_buffers - write out and wait upon a mapping's "associated" + * buffers + * @buffer_mapping - the mapping which backs the buffers' data + * @mapping - the mapping which wants those buffers written + * + * Starts I/O against the buffers at mapping->private_list, and waits upon + * that I/O. + * + * Basically, this is a convenience function for fsync(). @buffer_mapping is + * the blockdev which "owns" the buffers and @mapping is a file or directory + * which needs those buffers to be written for a successful fsync(). + */ +int sync_mapping_buffers(struct address_space *mapping) +{ + struct address_space *buffer_mapping = mapping->assoc_mapping; + + if (buffer_mapping == NULL || list_empty(&mapping->private_list)) + return 0; + + return fsync_buffers_list(&buffer_mapping->private_lock, + &mapping->private_list); +} +EXPORT_SYMBOL(sync_mapping_buffers); + +/* + * Called when we've recently written block `bblock', and it is known that + * `bblock' was for a buffer_boundary() buffer. This means that the block at + * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's + * dirty, schedule it for IO. So that indirects merge nicely with their data. + */ +void write_boundary_block(struct block_device *bdev, + sector_t bblock, unsigned blocksize) +{ + struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); + if (bh) { + if (buffer_dirty(bh)) + ll_rw_block(WRITE, 1, &bh); + put_bh(bh); + } +} + +void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) +{ + struct address_space *mapping = inode->i_mapping; + struct address_space *buffer_mapping = bh->b_page->mapping; + + mark_buffer_dirty(bh); + if (!mapping->assoc_mapping) { + mapping->assoc_mapping = buffer_mapping; + } else { + if (mapping->assoc_mapping != buffer_mapping) + BUG(); + } + if (list_empty(&bh->b_assoc_buffers)) + buffer_insert_list(&buffer_mapping->private_lock, + bh, &mapping->private_list); +} +EXPORT_SYMBOL(mark_buffer_dirty_inode); + +/* + * Add a page to the dirty page list. + * + * It is a sad fact of life that this function is called from several places + * deeply under spinlocking. It may not sleep. 
+ *
+ * If the page has buffers, the uptodate buffers are set dirty, to preserve
+ * dirty-state coherency between the page and the buffers. If the page does
+ * not have buffers then when they are later attached they will all be set
+ * dirty.
+ *
+ * The buffers are dirtied before the page is dirtied. There's a small race
+ * window in which a writepage caller may see the page cleanness but not the
+ * buffer dirtiness. That's fine. If this code were to set the page dirty
+ * before the buffers, a concurrent writepage caller could clear the page dirty
+ * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
+ * page on the dirty page list.
+ *
+ * There is also a small window where the page is dirty but not yet on
+ * dirty_pages, and a possibility that by the time the page is added to
+ * dirty_pages, it has been set clean. The page lists are somewhat
+ * approximate in this regard. It's better to have clean pages accidentally
+ * attached to dirty_pages than to leave dirty pages attached to clean_pages.
+ *
+ * We use private_lock to lock against try_to_free_buffers while using the
+ * page's buffer list. Also use this to protect against clean buffers being
+ * added to the page after it was set dirty.
+ *
+ * FIXME: may need to call ->reservepage here as well. That's rather up to the
+ * address_space though.
+ *
+ * For now, we treat swapper_space specially. It doesn't use the normal
+ * block a_ops.
+ */
+int __set_page_dirty_buffers(struct page *page)
+{
+	struct address_space * const mapping = page->mapping;
+	int ret = 0;
+
+	if (mapping == NULL) {
+		SetPageDirty(page);
+		goto out;
+	}
+
+	spin_lock(&mapping->private_lock);
+	if (page_has_buffers(page)) {
+		struct buffer_head *head = page_buffers(page);
+		struct buffer_head *bh = head;
+
+		do {
+			if (buffer_uptodate(bh))
+				set_buffer_dirty(bh);
+			else
+				buffer_error();
+			bh = bh->b_this_page;
+		} while (bh != head);
+	}
+	spin_unlock(&mapping->private_lock);
+
+	if (!TestSetPageDirty(page)) {
+		spin_lock(&mapping->page_lock);
+		if (page->mapping) {	/* Race with truncate? */
+			if (!mapping->backing_dev_info->memory_backed)
+				inc_page_state(nr_dirty);
+			list_del(&page->list);
+			list_add(&page->list, &mapping->dirty_pages);
+		}
+		spin_unlock(&mapping->page_lock);
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	}
+
+out:
+	return ret;
+}
+EXPORT_SYMBOL(__set_page_dirty_buffers);
+
+/*
+ * Write out and wait upon a list of buffers.
+ *
+ * We have conflicting pressures: we want to make sure that all
+ * initially dirty buffers get waited on, but that any subsequently
+ * dirtied buffers don't. After all, we don't want fsync to last
+ * forever if somebody is actively writing to the file.
+ *
+ * Do this in two main stages: first we copy dirty buffers to a
+ * temporary inode list, queueing the writes as we go. Then we clean
+ * up, waiting for those writes to complete.
+ *
+ * During this second stage, any subsequent updates to the file may end
+ * up refiling the buffer on the original inode's dirty list again, so
+ * there is a chance we will end up with a buffer queued for write but
+ * not yet completed on that list. So, as a final cleanup we go through
+ * the osync code to catch these locked, dirty buffers without requeuing
+ * any newly dirty buffers for write.
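+ *
+ * In outline, fsync_buffers_list() below does this (sketch only):
+ *
+ *	stage 1: move each dirty or locked buffer to a private tmp list,
+ *		 starting writeout of the dirty ones as they are moved;
+ *	stage 2: wait upon every buffer on tmp, noting any -EIO;
+ *	finally: call osync_buffers_list() to catch buffers which were
+ *		 redirtied and requeued while we were waiting.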
+ */ +int fsync_buffers_list(spinlock_t *lock, struct list_head *list) +{ + struct buffer_head *bh; + struct list_head tmp; + int err = 0, err2; + + INIT_LIST_HEAD(&tmp); + + spin_lock(lock); + while (!list_empty(list)) { + bh = BH_ENTRY(list->next); + list_del_init(&bh->b_assoc_buffers); + if (buffer_dirty(bh) || buffer_locked(bh)) { + list_add(&bh->b_assoc_buffers, &tmp); + if (buffer_dirty(bh)) { + get_bh(bh); + spin_unlock(lock); + /* + * Ensure any pending I/O completes so that + * ll_rw_block() actually writes the current + * contents - it is a noop if I/O is still in + * flight on potentially older contents. + */ + wait_on_buffer(bh); + ll_rw_block(WRITE, 1, &bh); + brelse(bh); + spin_lock(lock); + } + } + } + + while (!list_empty(&tmp)) { + bh = BH_ENTRY(tmp.prev); + __remove_assoc_queue(bh); + get_bh(bh); + spin_unlock(lock); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + err = -EIO; + brelse(bh); + spin_lock(lock); + } + + spin_unlock(lock); + err2 = osync_buffers_list(lock, list); + if (err) + return err; + else + return err2; +} + +/* + * Invalidate any and all dirty buffers on a given inode. We are + * probably unmounting the fs, but that doesn't mean we have already + * done a sync(). Just drop the buffers from the inode list. + * + * NOTE: we take the inode's blockdev's mapping's private_lock. Which + * assumes that all the buffers are against the blockdev. Not true + * for reiserfs. + */ +void invalidate_inode_buffers(struct inode *inode) +{ + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->assoc_mapping; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) + __remove_assoc_queue(BH_ENTRY(list->next)); + spin_unlock(&buffer_mapping->private_lock); + } +} + +/* + * Remove any clean buffers from the inode's buffer list. This is called + * when we're trying to free the inode itself. Those buffers can pin it. + * + * Returns true if all buffers were removed. + */ +int remove_inode_buffers(struct inode *inode) +{ + int ret = 1; + + if (inode_has_buffers(inode)) { + struct address_space *mapping = &inode->i_data; + struct list_head *list = &mapping->private_list; + struct address_space *buffer_mapping = mapping->assoc_mapping; + + spin_lock(&buffer_mapping->private_lock); + while (!list_empty(list)) { + struct buffer_head *bh = BH_ENTRY(list->next); + if (buffer_dirty(bh)) { + ret = 0; + break; + } + __remove_assoc_queue(bh); + } + spin_unlock(&buffer_mapping->private_lock); + } + return ret; +} + +/* + * Create the appropriate buffers when given a page for data area and + * the size of each buffer.. Use the bh->b_this_page linked list to + * follow the buffers created. Return NULL if unable to create more + * buffers. + * + * The retry flag is used to differentiate async IO (paging, swapping) + * which may not fail from ordinary buffer allocations. 
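+ *
+ * For example (assuming a 4k PAGE_SIZE): create_buffers(page, 512, 1)
+ * returns only once all eight buffer_heads have been allocated, whereas
+ * with retry == 0 it may return NULL under memory pressure.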
+ */
+static struct buffer_head *
+create_buffers(struct page * page, unsigned long size, int retry)
+{
+	struct buffer_head *bh, *head;
+	long offset;
+
+try_again:
+	head = NULL;
+	offset = PAGE_SIZE;
+	while ((offset -= size) >= 0) {
+		bh = alloc_buffer_head(GFP_NOFS);
+		if (!bh)
+			goto no_grow;
+
+		bh->b_bdev = NULL;
+		bh->b_this_page = head;
+		bh->b_blocknr = -1;
+		head = bh;
+
+		bh->b_state = 0;
+		atomic_set(&bh->b_count, 0);
+		bh->b_size = size;
+
+		/* Link the buffer to its page */
+		set_bh_page(bh, page, offset);
+
+		bh->b_end_io = NULL;
+	}
+	return head;
+/*
+ * In case anything failed, we just free everything we got.
+ */
+no_grow:
+	if (head) {
+		do {
+			bh = head;
+			head = head->b_this_page;
+			free_buffer_head(bh);
+		} while (head);
+	}
+
+	/*
+	 * Return failure for non-async IO requests. Async IO requests
+	 * are not allowed to fail, so we have to wait until buffer heads
+	 * become available. But we don't want tasks sleeping with
+	 * partially complete buffers, so all were released above.
+	 */
+	if (!retry)
+		return NULL;
+
+	/* We're _really_ low on memory. Now we just
+	 * wait for old buffer heads to become free due to
+	 * finishing IO. Since this is an async request and
+	 * the reserve list is empty, we're sure there are
+	 * async buffer heads in use.
+	 */
+	free_more_memory();
+	goto try_again;
+}
+
+static inline void
+link_dev_buffers(struct page *page, struct buffer_head *head)
+{
+	struct buffer_head *bh, *tail;
+
+	bh = head;
+	do {
+		tail = bh;
+		bh = bh->b_this_page;
+	} while (bh);
+	tail->b_this_page = head;
+	__set_page_buffers(page, head);
+}
+
+/*
+ * Initialise the state of a blockdev page's buffers.
+ */
+static void
+init_page_buffers(struct page *page, struct block_device *bdev,
+			int block, int size)
+{
+	struct buffer_head *head = page_buffers(page);
+	struct buffer_head *bh = head;
+	unsigned int b_state;
+
+	b_state = 1 << BH_Mapped;
+	if (PageUptodate(page))
+		b_state |= 1 << BH_Uptodate;
+
+	do {
+		if (!(bh->b_state & (1 << BH_Mapped))) {
+			init_buffer(bh, NULL, NULL);
+			bh->b_bdev = bdev;
+			bh->b_blocknr = block;
+			bh->b_state = b_state;
+		}
+		block++;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+
+/*
+ * Create the page-cache page that contains the requested block.
+ *
+ * This is used purely for blockdev mappings.
+ */
+static struct page *
+grow_dev_page(struct block_device *bdev, unsigned long block,
+		unsigned long index, int size)
+{
+	struct inode *inode = bdev->bd_inode;
+	struct page *page;
+	struct buffer_head *bh;
+
+	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+	if (!page)
+		return NULL;
+
+	if (!PageLocked(page))
+		BUG();
+
+	if (page_has_buffers(page)) {
+		bh = page_buffers(page);
+		if (bh->b_size == size)
+			return page;
+		if (!try_to_free_buffers(page))
+			goto failed;
+	}
+
+	/*
+	 * Allocate some buffers for this page
+	 */
+	bh = create_buffers(page, size, 0);
+	if (!bh)
+		goto failed;
+
+	/*
+	 * Link the page to the buffers and initialise them. Take the
+	 * lock to be atomic wrt __find_get_block(), which does not
+	 * run under the page lock.
+	 */
+	spin_lock(&inode->i_mapping->private_lock);
+	link_dev_buffers(page, bh);
+	init_page_buffers(page, bdev, block, size);
+	spin_unlock(&inode->i_mapping->private_lock);
+	return page;
+
+failed:
+	buffer_error();
+	unlock_page(page);
+	page_cache_release(page);
+	return NULL;
+}
+
+/*
+ * Create buffers for the specified block device block's page. If
+ * that page was dirty, the buffers are set dirty also.
+ *
+ * Except that's a bug.
Attaching dirty buffers to a dirty + * blockdev's page can result in filesystem corruption, because + * some of those buffers may be aliases of filesystem data. + * grow_dev_page() will go BUG() if this happens. + */ +static inline int +grow_buffers(struct block_device *bdev, unsigned long block, int size) +{ + struct page *page; + unsigned long index; + int sizebits; + + /* Size must be multiple of hard sectorsize */ + if (size & (bdev_hardsect_size(bdev)-1)) + BUG(); + if (size < 512 || size > PAGE_SIZE) + BUG(); + + sizebits = -1; + do { + sizebits++; + } while ((size << sizebits) < PAGE_SIZE); + + index = block >> sizebits; + block = index << sizebits; + + /* Create a page with the proper size buffers.. */ + page = grow_dev_page(bdev, block, index, size); + if (!page) + return 0; + unlock_page(page); + page_cache_release(page); + return 1; +} + +struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, int size) +{ + for (;;) { + struct buffer_head * bh; + + bh = __find_get_block(bdev, block, size); + if (bh) + return bh; + + if (!grow_buffers(bdev, block, size)) + free_more_memory(); + } +} + +/* + * The relationship between dirty buffers and dirty pages: + * + * Whenever a page has any dirty buffers, the page's dirty bit is set, and + * the page appears on its address_space.dirty_pages list. + * + * At all times, the dirtiness of the buffers represents the dirtiness of + * subsections of the page. If the page has buffers, the page dirty bit is + * merely a hint about the true dirty state. + * + * When a page is set dirty in its entirety, all its buffers are marked dirty + * (if the page has buffers). + * + * When a buffer is marked dirty, its page is dirtied, but the page's other + * buffers are not. + * + * Also. When blockdev buffers are explicitly read with bread(), they + * individually become uptodate. But their backing page remains not + * uptodate - even if all of its buffers are uptodate. A subsequent + * block_read_full_page() against that page will discover all the uptodate + * buffers, will set the page uptodate and will perform no I/O. + */ + +/** + * mark_buffer_dirty - mark a buffer_head as needing writeout + * + * mark_buffer_dirty() will set the dirty bit against the buffer, + * then set its backing page dirty, then attach the page to its + * address_space's dirty_pages list and then attach the address_space's + * inode to its superblock's dirty inode list. + * + * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mapping->page_lock and the global inode_lock. + */ +void mark_buffer_dirty(struct buffer_head *bh) +{ + if (!buffer_uptodate(bh)) + buffer_error(); + if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) + __set_page_dirty_nobuffers(bh->b_page); +} + +/* + * Decrement a buffer_head's reference count. If all buffers against a page + * have zero reference count, are clean and unlocked, and if the page is clean + * and unlocked then try_to_free_buffers() may strip the buffers from the page + * in preparation for freeing it (sometimes, rarely, buffers are removed from + * a page but it ends up not being freed, and buffers may later be reattached). + */ +void __brelse(struct buffer_head * buf) +{ + if (atomic_read(&buf->b_count)) { + put_bh(buf); + return; + } + printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n"); + buffer_error(); /* For the stack backtrace */ +} + +/* + * bforget() is like brelse(), except it discards any + * potentially dirty data. 
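+ *
+ * A hypothetical caller, for illustration:
+ *
+ *	bh = sb_getblk(sb, block);
+ *	... the block is being deallocated ...
+ *	bforget(bh);	- discard the buffer, dirty or not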
+ */ +void __bforget(struct buffer_head *bh) +{ + clear_buffer_dirty(bh); + if (!list_empty(&bh->b_assoc_buffers)) { + struct address_space *buffer_mapping = bh->b_page->mapping; + + spin_lock(&buffer_mapping->private_lock); + list_del_init(&bh->b_assoc_buffers); + spin_unlock(&buffer_mapping->private_lock); + } + __brelse(bh); +} + +static struct buffer_head *__bread_slow(struct buffer_head *bh) +{ + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } else { + if (buffer_dirty(bh)) + buffer_error(); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) + return bh; + } + brelse(bh); + return NULL; +} + +/* + * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). + * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their + * refcount elevated by one when they're in an LRU. A buffer can only appear + * once in a particular CPU's LRU. A single buffer can be present in multiple + * CPU's LRUs at the same time. + * + * This is a transparent caching front-end to sb_bread(), sb_getblk() and + * sb_find_get_block(). + * + * The LRUs themselves only need locking against invalidate_bh_lrus. We use + * a local interrupt disable for that. + */ + +#define BH_LRU_SIZE 8 + +struct bh_lru { + struct buffer_head *bhs[BH_LRU_SIZE]; +}; + +static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{0}}; + +#ifdef CONFIG_SMP +#define bh_lru_lock() local_irq_disable() +#define bh_lru_unlock() local_irq_enable() +#else +#define bh_lru_lock() preempt_disable() +#define bh_lru_unlock() preempt_enable() +#endif + +static inline void check_irqs_on(void) +{ +#ifdef irqs_disabled + BUG_ON(irqs_disabled()); +#endif +} + +/* + * The LRU management algorithm is dopey-but-simple. Sorry. + */ +static void bh_lru_install(struct buffer_head *bh) +{ + struct buffer_head *evictee = NULL; + struct bh_lru *lru; + + check_irqs_on(); + bh_lru_lock(); + lru = &__get_cpu_var(bh_lrus); + if (lru->bhs[0] != bh) { + struct buffer_head *bhs[BH_LRU_SIZE]; + int in; + int out = 0; + + get_bh(bh); + bhs[out++] = bh; + for (in = 0; in < BH_LRU_SIZE; in++) { + struct buffer_head *bh2 = lru->bhs[in]; + + if (bh2 == bh) { + __brelse(bh2); + } else { + if (out >= BH_LRU_SIZE) { + BUG_ON(evictee != NULL); + evictee = bh2; + } else { + bhs[out++] = bh2; + } + } + } + while (out < BH_LRU_SIZE) + bhs[out++] = NULL; + memcpy(lru->bhs, bhs, sizeof(bhs)); + } + bh_lru_unlock(); + + if (evictee) + __brelse(evictee); +} + +/* + * Look up the bh in this cpu's LRU. If it's there, move it to the head. + */ +static inline struct buffer_head * +lookup_bh_lru(struct block_device *bdev, sector_t block, int size) +{ + struct buffer_head *ret = NULL; + struct bh_lru *lru; + int i; + + check_irqs_on(); + bh_lru_lock(); + lru = &__get_cpu_var(bh_lrus); + for (i = 0; i < BH_LRU_SIZE; i++) { + struct buffer_head *bh = lru->bhs[i]; + + if (bh && bh->b_bdev == bdev && + bh->b_blocknr == block && bh->b_size == size) { + if (i) { + while (i) { + lru->bhs[i] = lru->bhs[i - 1]; + i--; + } + lru->bhs[0] = bh; + } + get_bh(bh); + ret = bh; + break; + } + } + bh_lru_unlock(); + return ret; +} + +/* + * Perform a pagecache lookup for the matching buffer. If it's there, refresh + * it in the LRU and mark it as accessed. 
If it is not present then return + * NULL + */ +struct buffer_head * +__find_get_block(struct block_device *bdev, sector_t block, int size) +{ + struct buffer_head *bh = lookup_bh_lru(bdev, block, size); + + if (bh == NULL) { + bh = __find_get_block_slow(bdev, block, size); + if (bh) + bh_lru_install(bh); + } + if (bh) + touch_buffer(bh); + return bh; +} +EXPORT_SYMBOL(__find_get_block); + +/* + * __getblk will locate (and, if necessary, create) the buffer_head + * which corresponds to the passed block_device, block and size. The + * returned buffer has its reference count incremented. + * + * __getblk() cannot fail - it just keeps trying. If you pass it an + * illegal block number, __getblk() will happily return a buffer_head + * which represents the non-existent block. Very weird. + * + * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() + * attempt is failing. FIXME, perhaps? + */ +struct buffer_head * +__getblk(struct block_device *bdev, sector_t block, int size) +{ + struct buffer_head *bh = __find_get_block(bdev, block, size); + + if (bh == NULL) + bh = __getblk_slow(bdev, block, size); + return bh; +} +EXPORT_SYMBOL(__getblk); + +/* + * Do async read-ahead on a buffer.. + */ +void __breadahead(struct block_device *bdev, sector_t block, int size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + ll_rw_block(READA, 1, &bh); + brelse(bh); +} +EXPORT_SYMBOL(__breadahead); + +/** + * __bread() - reads a specified block and returns the bh + * @block: number of block + * @size: size (in bytes) to read + * + * Reads a specified block, and returns buffer head that contains it. + * It returns NULL if the block was unreadable. + */ +struct buffer_head * +__bread(struct block_device *bdev, sector_t block, int size) +{ + struct buffer_head *bh = __getblk(bdev, block, size); + + if (!buffer_uptodate(bh)) + bh = __bread_slow(bh); + return bh; +} +EXPORT_SYMBOL(__bread); + +/* + * invalidate_bh_lrus() is called rarely - at unmount. Because it is only for + * unmount it only needs to ensure that all buffers from the target device are + * invalidated on return and it doesn't need to worry about new buffers from + * that device being added - the unmount code has to prevent that. + */ +static void invalidate_bh_lru(void *arg) +{ + struct bh_lru *b = &get_cpu_var(bh_lrus); + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } + put_cpu_var(bh_lrus); +} + +static void invalidate_bh_lrus(void) +{ + on_each_cpu(invalidate_bh_lru, NULL, 1, 1); +} + +void set_bh_page(struct buffer_head *bh, + struct page *page, unsigned long offset) +{ + bh->b_page = page; + if (offset >= PAGE_SIZE) + BUG(); + if (PageHighMem(page)) + /* + * This catches illegal uses and preserves the offset: + */ + bh->b_data = (char *)(0 + offset); + else + bh->b_data = page_address(page) + offset; +} +EXPORT_SYMBOL(set_bh_page); + +/* + * Called when truncating a buffer on a page completely. + */ +static inline void discard_buffer(struct buffer_head * bh) +{ + lock_buffer(bh); + clear_buffer_dirty(bh); + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + clear_buffer_delay(bh); + unlock_buffer(bh); +} + +/** + * try_to_release_page() - release old fs-specific metadata on a page + * + * @page: the page which the kernel is trying to free + * @gfp_mask: memory allocation flags (and I/O mode) + * + * The address_space is to try to release any data against the page + * (presumably at page->private). 
If the release was successful, return `1'.
+ * Otherwise return zero.
+ *
+ * The @gfp_mask argument specifies whether I/O may be performed to release
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ *
+ * NOTE: @gfp_mask may go away, and this function may become non-blocking.
+ */
+int try_to_release_page(struct page *page, int gfp_mask)
+{
+	struct address_space * const mapping = page->mapping;
+
+	if (!PageLocked(page))
+		BUG();
+	if (PageWriteback(page))
+		return 0;
+
+	if (mapping && mapping->a_ops->releasepage)
+		return mapping->a_ops->releasepage(page, gfp_mask);
+	return try_to_free_buffers(page);
+}
+
+/**
+ * block_invalidatepage - invalidate part or all of a buffer-backed page
+ *
+ * @page: the page which is affected
+ * @offset: the index of the truncation point
+ *
+ * block_invalidatepage() is called when all or part of the page has become
+ * invalidated by a truncate operation.
+ *
+ * block_invalidatepage() does not have to release all buffers, but it must
+ * ensure that no dirty buffer is left outside @offset and that no I/O
+ * is underway against any of the blocks which are outside the truncation
+ * point, because the caller is about to free (and possibly reuse) those
+ * blocks on-disk.
+ */
+int block_invalidatepage(struct page *page, unsigned long offset)
+{
+	struct buffer_head *head, *bh, *next;
+	unsigned int curr_off = 0;
+	int ret = 1;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		goto out;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+		next = bh->b_this_page;
+
+		/*
+		 * is this block fully invalidated?
+		 */
+		if (offset <= curr_off)
+			discard_buffer(bh);
+		curr_off = next_off;
+		bh = next;
+	} while (bh != head);
+
+	/*
+	 * We release buffers only if the entire page is being invalidated.
+	 * The get_block cached value has been unconditionally invalidated,
+	 * so real IO is not possible anymore.
+	 */
+	if (offset == 0)
+		ret = try_to_release_page(page, 0);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(block_invalidatepage);
+
+/*
+ * We attach and possibly dirty the buffers atomically wrt
+ * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
+ * is already excluded via the page lock.
+ */
+void create_empty_buffers(struct page *page,
+			unsigned long blocksize, unsigned long b_state)
+{
+	struct buffer_head *bh, *head, *tail;
+
+	head = create_buffers(page, blocksize, 1);
+	bh = head;
+	do {
+		bh->b_state |= b_state;
+		tail = bh;
+		bh = bh->b_this_page;
+	} while (bh);
+	tail->b_this_page = head;
+
+	spin_lock(&page->mapping->private_lock);
+	if (PageUptodate(page) || PageDirty(page)) {
+		bh = head;
+		do {
+			if (PageDirty(page))
+				set_buffer_dirty(bh);
+			if (PageUptodate(page))
+				set_buffer_uptodate(bh);
+			bh = bh->b_this_page;
+		} while (bh != head);
+	}
+	__set_page_buffers(page, head);
+	spin_unlock(&page->mapping->private_lock);
+}
+EXPORT_SYMBOL(create_empty_buffers);
+
+/*
+ * We are taking a block for data and we don't want any output from any
+ * buffer-cache aliases starting from return from that function and
+ * until the moment when something will explicitly mark the buffer
+ * dirty (hopefully that will not happen until we free that block ;-)
+ * We don't even need to mark it not-uptodate - nobody can expect
+ * anything from a newly allocated buffer anyway. We used to use
+ * unmap_buffer() for such invalidation, but that was wrong.
We definitely + * don't want to mark the alias unmapped, for example - it would confuse + * anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can + * be writeout I/O going on against recently-freed buffers. We don't + * wait on that I/O in bforget() - it's more efficient to wait on the I/O + * only if we really need to. That happens here. + */ +void unmap_underlying_metadata(struct block_device *bdev, sector_t block) +{ + struct buffer_head *old_bh; + + old_bh = __find_get_block_slow(bdev, block, 0); + if (old_bh) { +#if 0 /* This happens. Later. */ + if (buffer_dirty(old_bh)) + buffer_error(); +#endif + clear_buffer_dirty(old_bh); + wait_on_buffer(old_bh); + clear_buffer_req(old_bh); + __brelse(old_bh); + } +} +EXPORT_SYMBOL(unmap_underlying_metadata); + +/* + * NOTE! All mapped/uptodate combinations are valid: + * + * Mapped Uptodate Meaning + * + * No No "unknown" - must do get_block() + * No Yes "hole" - zero-filled + * Yes No "allocated" - allocated on disk, not read in + * Yes Yes "valid" - allocated and up-to-date in memory. + * + * "Dirty" is valid only with the last case (mapped+uptodate). + */ + +/* + * While block_write_full_page is writing back the dirty buffers under + * the page lock, whoever dirtied the buffers may decide to clean them + * again at any time. We handle that by only looking at the buffer + * state inside lock_buffer(). + * + * If block_write_full_page() is called for regular writeback + * (called_for_sync() is false) then it will redirty a page which has a locked + * buffer. This only can happen if someone has written the buffer directly, + * with submit_bh(). At the address_space level PageWriteback prevents this + * contention from occurring. + */ +static int __block_write_full_page(struct inode *inode, struct page *page, + get_block_t *get_block, struct writeback_control *wbc) +{ + int err; + unsigned long block; + unsigned long last_block; + struct buffer_head *bh, *head; + int nr_underway = 0; + + BUG_ON(!PageLocked(page)); + + last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; + + if (!page_has_buffers(page)) { + if (!PageUptodate(page)) + buffer_error(); + create_empty_buffers(page, 1 << inode->i_blkbits, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + + /* + * Be very careful. We have no exclusion from __set_page_dirty_buffers + * here, and the (potentially unmapped) buffers may become dirty at + * any time. If a buffer becomes dirty here after we've inspected it + * then we just miss that fact, and the page stays dirty. + * + * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * handle that here by just cleaning them. + */ + + block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + head = page_buffers(page); + bh = head; + + /* + * Get all the dirty buffers mapped to disk addresses and + * handle any aliases from the underlying blockdev's mapping. + */ + do { + if (block > last_block) { + /* + * mapped buffers outside i_size will occur, because + * this page can be outside i_size when there is a + * truncate in progress. 
+ * + * if (buffer_mapped(bh)) + * buffer_error(); + */ + /* + * The buffer was zeroed by block_write_full_page() + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { + if (buffer_new(bh)) + buffer_error(); + err = get_block(inode, block, bh, 1); + if (err) + goto recover; + if (buffer_new(bh)) { + /* blockdev mappings never come here */ + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + } + } + bh = bh->b_this_page; + block++; + } while (bh != head); + + do { + get_bh(bh); + if (buffer_mapped(bh) && buffer_dirty(bh)) { + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else { + if (test_set_buffer_locked(bh)) { + __set_page_dirty_nobuffers(page); + continue; + } + } + if (test_clear_buffer_dirty(bh)) { + if (!buffer_uptodate(bh)) + buffer_error(); + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } + } while ((bh = bh->b_this_page) != head); + + BUG_ON(PageWriteback(page)); + SetPageWriteback(page); /* Keeps try_to_free_buffers() away */ + unlock_page(page); + + /* + * The page may come unlocked any time after the *first* submit_bh() + * call. Be careful with its buffers. + */ + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(WRITE, bh); + nr_underway++; + } + put_bh(bh); + bh = next; + } while (bh != head); + + err = 0; +done: + if (nr_underway == 0) { + /* + * The page was marked dirty, but the buffers were + * clean. Someone wrote them back by hand with + * ll_rw_block/submit_bh. A rare case. + */ + int uptodate = 1; + do { + if (!buffer_uptodate(bh)) { + uptodate = 0; + break; + } + bh = bh->b_this_page; + } while (bh != head); + if (uptodate) + SetPageUptodate(page); + end_page_writeback(page); + } + return err; + +recover: + /* + * ENOSPC, or some other error. We may already have added some + * blocks to the file, so we need to write these out to avoid + * exposing stale data. + * The page is currently locked and not marked for writeback + */ + bh = head; + /* Recovery: lock and submit the mapped buffers */ + do { + get_bh(bh); + if (buffer_mapped(bh) && buffer_dirty(bh)) { + lock_buffer(bh); + mark_buffer_async_write(bh); + } else { + /* + * The buffer may have been set dirty during + * attachment to a dirty page. 
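+ * Such a buffer is either not mapped (so there is nowhere to
+ * write it to) or already clean; dropping the dirty bit here is
+ * safe and keeps it from being written out later.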
+ */ + clear_buffer_dirty(bh); + } + } while ((bh = bh->b_this_page) != head); + SetPageError(page); + BUG_ON(PageWriteback(page)); + SetPageWriteback(page); + unlock_page(page); + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + clear_buffer_dirty(bh); + submit_bh(WRITE, bh); + nr_underway++; + } + put_bh(bh); + bh = next; + } while (bh != head); + goto done; +} + +static int __block_prepare_write(struct inode *inode, struct page *page, + unsigned from, unsigned to, get_block_t *get_block) +{ + unsigned block_start, block_end; + sector_t block; + int err = 0; + unsigned blocksize, bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; + + BUG_ON(!PageLocked(page)); + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > to); + + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + + bbits = inode->i_blkbits; + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + + for(bh = head, block_start = 0; bh != head || !block_start; + block++, block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } + continue; + } + if (buffer_new(bh)) + clear_buffer_new(bh); + if (!buffer_mapped(bh)) { + err = get_block(inode, block, bh, 1); + if (err) + goto out; + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (PageUptodate(page)) { + if (!buffer_mapped(bh)) + buffer_error(); + set_buffer_uptodate(bh); + continue; + } + if (block_end > to || block_start < from) { + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + if (block_end > to) + memset(kaddr+to, 0, + block_end-to); + if (block_start < from) + memset(kaddr+block_start, + 0, from-block_start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + continue; + } + } + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + continue; + } + if (!buffer_uptodate(bh) && !buffer_delay(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++=bh; + } + } + /* + * If we issued read requests - let them complete. + */ + while(wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + return -EIO; + } + return 0; +out: + /* + * Zero out any newly allocated blocks to avoid exposing stale + * data. If BH_New is set, we know that the block was newly + * allocated in the above loop. 
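+ * Zeroing the block and marking it uptodate and dirty ensures that
+ * zeroes, rather than stale disk contents, are what eventually get
+ * written back for it.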
+ */ + bh = head; + block_start = 0; + do { + block_end = block_start+blocksize; + if (block_end <= from) + goto next_bh; + if (block_start >= to) + break; + if (buffer_new(bh)) { + void *kaddr; + + clear_buffer_new(bh); + if (buffer_uptodate(bh)) + buffer_error(); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+block_start, 0, bh->b_size); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } +next_bh: + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); + return err; +} + +static int __block_commit_write(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + + blocksize = 1 << inode->i_blkbits; + + for(bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + } + + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} + +/* + * Generic "read page" function for block devices that have the normal + * get_block functionality. This is most of the block device filesystems. + * Reads the page asynchronously --- the unlock_buffer() and + * set/clear_buffer_uptodate() functions propagate buffer state into the + * page struct once IO has completed. + */ +int block_read_full_page(struct page *page, get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + sector_t iblock, lblock; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + unsigned int blocksize; + int nr, i; + int fully_mapped = 1; + + if (!PageLocked(page)) + PAGE_BUG(page); + if (PageUptodate(page)) + buffer_error(); + blocksize = 1 << inode->i_blkbits; + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + + iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; + bh = head; + nr = 0; + i = 0; + + do { + if (buffer_uptodate(bh)) + continue; + + if (!buffer_mapped(bh)) { + fully_mapped = 0; + if (iblock < lblock) { + if (get_block(inode, iblock, bh, 0)) + SetPageError(page); + } + if (!buffer_mapped(bh)) { + void *kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + i * blocksize, 0, blocksize); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh); + continue; + } + /* + * get_block() might have updated the buffer + * synchronously + */ + if (buffer_uptodate(bh)) + continue; + } + arr[nr++] = bh; + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + if (fully_mapped) + SetPageMappedToDisk(page); + + if (!nr) { + /* + * All buffers are uptodate - we can set the page uptodate + * as well. But not if get_block() returned an error. + */ + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + /* Stage two: lock the buffers */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + lock_buffer(bh); + mark_buffer_async_read(bh); + } + + /* + * Stage 3: start the IO. 
Check for uptodateness + * inside the buffer lock in case another process reading + * the underlying blockdev brought it uptodate (the sct fix). + */ + for (i = 0; i < nr; i++) { + bh = arr[i]; + if (buffer_uptodate(bh)) + end_buffer_async_read(bh, 1); + else + submit_bh(READ, bh); + } + return 0; +} + +/* utility function for filesystems that need to do work on expanding + * truncates. Uses prepare/commit_write to allow the filesystem to + * deal with the hole. + */ +int generic_cont_expand(struct inode *inode, loff_t size) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long index, offset, limit; + int err; + + err = -EFBIG; + limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && size > (loff_t)limit) { + send_sig(SIGXFSZ, current, 0); + goto out; + } + if (size > inode->i_sb->s_maxbytes) + goto out; + + offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ + + /* ugh. in prepare/commit_write, if from==to==start of block, we + ** skip the prepare. make sure we never send an offset for the start + ** of a block + */ + if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { + offset++; + } + index = size >> PAGE_CACHE_SHIFT; + err = -ENOMEM; + page = grab_cache_page(mapping, index); + if (!page) + goto out; + err = mapping->a_ops->prepare_write(NULL, page, offset, offset); + if (!err) { + err = mapping->a_ops->commit_write(NULL, page, offset, offset); + } + unlock_page(page); + page_cache_release(page); + if (err > 0) + err = 0; +out: + return err; +} + +/* + * For moronic filesystems that do not allow holes in file. + * We may have to extend the file. + */ + +int cont_prepare_write(struct page *page, unsigned offset, + unsigned to, get_block_t *get_block, loff_t *bytes) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct page *new_page; + unsigned long pgpos; + long status; + unsigned zerofrom; + unsigned blocksize = 1 << inode->i_blkbits; + void *kaddr; + + while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { + status = -ENOMEM; + new_page = grab_cache_page(mapping, pgpos); + if (!new_page) + goto out; + /* we might sleep */ + if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { + unlock_page(new_page); + page_cache_release(new_page); + continue; + } + zerofrom = *bytes & ~PAGE_CACHE_MASK; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + status = __block_prepare_write(inode, new_page, zerofrom, + PAGE_CACHE_SIZE, get_block); + if (status) + goto out_unmap; + kaddr = kmap_atomic(new_page, KM_USER0); + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); + flush_dcache_page(new_page); + kunmap_atomic(kaddr, KM_USER0); + __block_commit_write(inode, new_page, + zerofrom, PAGE_CACHE_SIZE); + unlock_page(new_page); + page_cache_release(new_page); + } + + if (page->index < pgpos) { + /* completely inside the area */ + zerofrom = offset; + } else { + /* page covers the boundary, find the boundary offset */ + zerofrom = *bytes & ~PAGE_CACHE_MASK; + + /* if we will expand the thing last block will be filled */ + if (to > zerofrom && (zerofrom & (blocksize-1))) { + *bytes |= (blocksize-1); + (*bytes)++; + } + + /* starting below the boundary? 
Nothing to zero out */ + if (offset <= zerofrom) + zerofrom = offset; + } + status = __block_prepare_write(inode, page, zerofrom, to, get_block); + if (status) + goto out1; + if (zerofrom < offset) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+zerofrom, 0, offset-zerofrom); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + __block_commit_write(inode, page, zerofrom, offset); + } + return 0; +out1: + ClearPageUptodate(page); + return status; + +out_unmap: + ClearPageUptodate(new_page); + unlock_page(new_page); + page_cache_release(new_page); +out: + return status; +} + +int block_prepare_write(struct page *page, unsigned from, unsigned to, + get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + int err = __block_prepare_write(inode, page, from, to, get_block); + if (err) + ClearPageUptodate(page); + return err; +} + +int block_commit_write(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + __block_commit_write(inode,page,from,to); + return 0; +} + +int generic_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + __block_commit_write(inode,page,from,to); + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_sem. + */ + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + return 0; +} + +/* + * On entry, the page is fully not uptodate. + * On exit the page is fully uptodate in the areas outside (from,to) + */ +int nobh_prepare_write(struct page *page, unsigned from, unsigned to, + get_block_t *get_block) +{ + struct inode *inode = page->mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + struct buffer_head map_bh; + struct buffer_head *read_bh[MAX_BUF_PER_PAGE]; + unsigned block_in_page; + unsigned block_start; + sector_t block_in_file; + char *kaddr; + int nr_reads = 0; + int i; + int ret = 0; + int is_mapped_to_disk = 1; + int dirtied_it = 0; + + if (PageMappedToDisk(page)) + return 0; + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + map_bh.b_page = page; + + /* + * We loop across all blocks in the page, whether or not they are + * part of the affected region. This is so we can discover if the + * page is fully mapped-to-disk. 
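+ *
+ * For instance (sketch, assuming 1k blocks): for a write of bytes
+ * 100-200 all four blocks of the page are passed to get_block(), but
+ * `create' is 1 only for blocks which begin before `to'.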
+ */ + for (block_start = 0, block_in_page = 0; + block_start < PAGE_CACHE_SIZE; + block_in_page++, block_start += blocksize) { + unsigned block_end = block_start + blocksize; + int create; + + map_bh.b_state = 0; + create = 1; + if (block_start >= to) + create = 0; + ret = get_block(inode, block_in_file + block_in_page, + &map_bh, create); + if (ret) + goto failed; + if (!buffer_mapped(&map_bh)) + is_mapped_to_disk = 0; + if (buffer_new(&map_bh)) + unmap_underlying_metadata(map_bh.b_bdev, + map_bh.b_blocknr); + if (PageUptodate(page)) + continue; + if (buffer_new(&map_bh) || !buffer_mapped(&map_bh)) { + kaddr = kmap_atomic(page, KM_USER0); + if (block_start < from) { + memset(kaddr+block_start, 0, from-block_start); + dirtied_it = 1; + } + if (block_end > to) { + memset(kaddr + to, 0, block_end - to); + dirtied_it = 1; + } + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + continue; + } + if (buffer_uptodate(&map_bh)) + continue; /* reiserfs does this */ + if (block_start < from || block_end > to) { + struct buffer_head *bh = alloc_buffer_head(GFP_NOFS); + + if (!bh) { + ret = -ENOMEM; + goto failed; + } + bh->b_state = map_bh.b_state; + atomic_set(&bh->b_count, 0); + bh->b_this_page = 0; + bh->b_page = page; + bh->b_blocknr = map_bh.b_blocknr; + bh->b_size = blocksize; + bh->b_data = (char *)(long)block_start; + bh->b_bdev = map_bh.b_bdev; + bh->b_private = NULL; + read_bh[nr_reads++] = bh; + } + } + + if (nr_reads) { + ll_rw_block(READ, nr_reads, read_bh); + for (i = 0; i < nr_reads; i++) { + wait_on_buffer(read_bh[i]); + if (!buffer_uptodate(read_bh[i])) + ret = -EIO; + free_buffer_head(read_bh[i]); + read_bh[i] = NULL; + } + if (ret) + goto failed; + } + + if (is_mapped_to_disk) + SetPageMappedToDisk(page); + SetPageUptodate(page); + + /* + * Setting the page dirty here isn't necessary for the prepare_write + * function - commit_write will do that. But if/when this function is + * used within the pagefault handler to ensure that all mmapped pages + * have backing space in the filesystem, we will need to dirty the page + * if its contents were altered. + */ + if (dirtied_it) + set_page_dirty(page); + + return 0; + +failed: + for (i = 0; i < nr_reads; i++) { + if (read_bh[i]) + free_buffer_head(read_bh[i]); + } + + /* + * Error recovery is pretty slack. Clear the page and mark it dirty + * so we'll later zero out any blocks which _were_ allocated. + */ + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr, 0, PAGE_CACHE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + SetPageUptodate(page); + set_page_dirty(page); + return ret; +} +EXPORT_SYMBOL(nobh_prepare_write); + +int nobh_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + set_page_dirty(page); + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + return 0; +} +EXPORT_SYMBOL(nobh_commit_write); + +/* + * This function assumes that ->prepare_write() uses nobh_prepare_write(). 
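+ *
+ * i.e. the filesystem's prepare_write a_op is expected to be a thin
+ * wrapper such as this (hypothetical example):
+ *
+ *	static int fs_prepare_write(struct file *file, struct page *page,
+ *			unsigned from, unsigned to)
+ *	{
+ *		return nobh_prepare_write(page, from, to, fs_get_block);
+ *	}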
+ */ +int nobh_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + unsigned blocksize = 1 << inode->i_blkbits; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned to; + struct page *page; + struct address_space_operations *a_ops = mapping->a_ops; + char *kaddr; + int ret = 0; + + if ((offset & (blocksize - 1)) == 0) + goto out; + + ret = -ENOMEM; + page = grab_cache_page(mapping, index); + if (!page) + goto out; + + to = (offset + blocksize) & ~(blocksize - 1); + ret = a_ops->prepare_write(NULL, page, offset, to); + if (ret == 0) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + } + unlock_page(page); + page_cache_release(page); +out: + return ret; +} +EXPORT_SYMBOL(nobh_truncate_page); + +int block_truncate_page(struct address_space *mapping, + loff_t from, get_block_t *get_block) +{ + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + void *kaddr; + int err; + + blocksize = 1 << inode->i_blkbits; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + /* Find the buffer that contains "offset" */ + bh = page_buffers(page); + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + err = get_block(inode, iblock, bh, 0); + if (err) + goto unlock; + /* unmapped? It's a hole - nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh) && !buffer_delay(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(bh); + err = 0; + +unlock: + unlock_page(page); + page_cache_release(page); +out: + return err; +} + +/* + * The generic ->writepage function for buffer-backed address_spaces + */ +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) +{ + struct inode * const inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + const unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + unsigned offset; + void *kaddr; + + /* Is the page fully inside i_size? */ + if (page->index < end_index) + return __block_write_full_page(inode, page, get_block, wbc); + + /* Is the page fully outside i_size? (truncate in progress) */ + offset = i_size & (PAGE_CACHE_SIZE-1); + if (page->index >= end_index+1 || !offset) { + /* + * The page may have dirty, unmapped buffers. For example, + * they may have been added in ext3_writepage(). Make them + * freeable here, so the page does not leak. 
+ */
+		block_invalidatepage(page, 0);
+		unlock_page(page);
+		return 0; /* don't care */
+	}
+
+	/*
+	 * The page straddles i_size. It must be zeroed out on each and every
+	 * writepage invocation because it may be mmapped. "A file is mapped
+	 * in multiples of the page size. For a file that is not a multiple of
+	 * the page size, the remaining memory is zeroed when mapped, and
+	 * writes to that region are not written out to the file."
+	 */
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr, KM_USER0);
+	return __block_write_full_page(inode, page, get_block, wbc);
+}
+
+sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
+			get_block_t *get_block)
+{
+	struct buffer_head tmp;
+	struct inode *inode = mapping->host;
+	tmp.b_state = 0;
+	tmp.b_blocknr = 0;
+	get_block(inode, block, &tmp, 0);
+	return tmp.b_blocknr;
+}
+
+static int end_bio_bh_io_sync(struct bio *bio, unsigned int bytes_done, int err)
+{
+	struct buffer_head *bh = bio->bi_private;
+
+	if (bio->bi_size)
+		return 1;
+
+	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
+	bio_put(bio);
+	return 0;
+}
+
+int submit_bh(int rw, struct buffer_head * bh)
+{
+	struct bio *bio;
+
+	BUG_ON(!buffer_locked(bh));
+	BUG_ON(!buffer_mapped(bh));
+	BUG_ON(!bh->b_end_io);
+
+	if ((rw == READ || rw == READA) && buffer_uptodate(bh))
+		buffer_error();
+	if (rw == WRITE && !buffer_uptodate(bh))
+		buffer_error();
+	if (rw == READ && buffer_dirty(bh))
+		buffer_error();
+
+	/* Only clear out a write error when rewriting */
+	if (test_set_buffer_req(bh) && rw == WRITE)
+		clear_buffer_write_io_error(bh);
+
+	/*
+	 * from here on down, it's all bio -- do the initial mapping,
+	 * submit_bio -> generic_make_request may further map this bio around
+	 */
+	bio = bio_alloc(GFP_NOIO, 1);
+
+	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	bio->bi_io_vec[0].bv_page = bh->b_page;
+	bio->bi_io_vec[0].bv_len = bh->b_size;
+	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
+
+	bio->bi_vcnt = 1;
+	bio->bi_idx = 0;
+	bio->bi_size = bh->b_size;
+
+	bio->bi_end_io = end_bio_bh_io_sync;
+	bio->bi_private = bh;
+
+	return submit_bio(rw, bio);
+}
+
+/**
+ * ll_rw_block: low-level access to block devices (DEPRECATED)
+ * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
+ * @nr: number of &struct buffer_heads in the array
+ * @bhs: array of pointers to &struct buffer_head
+ *
+ * ll_rw_block() takes an array of pointers to &struct buffer_heads,
+ * and requests an I/O operation on them, either a %READ or a %WRITE.
+ * The third %READA option is described in the documentation for
+ * generic_make_request() which ll_rw_block() calls.
+ *
+ * This function drops any buffer that it cannot get a lock on (with the
+ * BH_Lock state bit), any buffer that appears to be clean when doing a
+ * write request, and any buffer that appears to be up-to-date when doing
+ * a read request. Further it marks as clean buffers that are processed for
+ * writing (the buffer cache won't assume that they are actually clean until
+ * the buffer gets unlocked).
+ *
+ * ll_rw_block sets b_end_io to a simple completion handler that marks
+ * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
+ * any waiters.
+ *
+ * All of the buffers must be for the same device, and must also be a
+ * multiple of the current approved size for the device.
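+ *
+ * Example (illustrative): start writeback of a single dirty buffer
+ * without waiting for it, as write_boundary_block() above does:
+ *
+ *	if (buffer_dirty(bh))
+ *		ll_rw_block(WRITE, 1, &bh);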
+ */ +void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) +{ + int i; + + for (i = 0; i < nr; i++) { + struct buffer_head *bh = bhs[i]; + + if (test_set_buffer_locked(bh)) + continue; + + get_bh(bh); + if (rw == WRITE) { + bh->b_end_io = end_buffer_write_sync; + if (test_clear_buffer_dirty(bh)) { + submit_bh(WRITE, bh); + continue; + } + } else { + bh->b_end_io = end_buffer_read_sync; + if (!buffer_uptodate(bh)) { + submit_bh(rw, bh); + continue; + } + } + unlock_buffer(bh); + put_bh(bh); + } +} + +/* + * For a data-integrity writeout, we need to wait upon any in-progress I/O + * and then start new I/O and then wait upon it. + */ +void sync_dirty_buffer(struct buffer_head *bh) +{ + WARN_ON(atomic_read(&bh->b_count) < 1); + lock_buffer(bh); + if (test_clear_buffer_dirty(bh)) { + get_bh(bh); + bh->b_end_io = end_buffer_write_sync; + submit_bh(WRITE, bh); + wait_on_buffer(bh); + } else { + unlock_buffer(bh); + } +} + +/* + * Sanity checks for try_to_free_buffers. + */ +static void check_ttfb_buffer(struct page *page, struct buffer_head *bh) +{ + if (!buffer_uptodate(bh) && !buffer_req(bh)) { + if (PageUptodate(page) && page->mapping + && buffer_mapped(bh) /* discard_buffer */ + && S_ISBLK(page->mapping->host->i_mode)) + { + buffer_error(); + } + } +} + +/* + * try_to_free_buffers() checks if all the buffers on this particular page + * are unused, and releases them if so. + * + * Exclusion against try_to_free_buffers may be obtained by either + * locking the page or by holding its mapping's private_lock. + * + * If the page is dirty but all the buffers are clean then we need to + * be sure to mark the page clean as well. This is because the page + * may be against a block device, and a later reattachment of buffers + * to a dirty page will set *all* buffers dirty. Which would corrupt + * filesystem data on the same device. + * + * The same applies to regular filesystem pages: if all the buffers are + * clean then we set the page clean and proceed. To do that, we require + * total exclusion from __set_page_dirty_buffers(). That is obtained with + * private_lock. + * + * try_to_free_buffers() is non-blocking. 
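+ *
+ * A caller consequently looks something like this (sketch only; the page
+ * lock is one of the two exclusions named above):
+ *
+ *	lock_page(page);
+ *	if (try_to_free_buffers(page))
+ *		... the buffer ring is gone; the page itself remains ...
+ *	unlock_page(page);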
+ */ +static inline int buffer_busy(struct buffer_head *bh) +{ + return atomic_read(&bh->b_count) | + (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock))); +} + +static int +drop_buffers(struct page *page, struct buffer_head **buffers_to_free) +{ + struct buffer_head *head = page_buffers(page); + struct buffer_head *bh; + int was_uptodate = 1; + + bh = head; + do { + check_ttfb_buffer(page, bh); + if (buffer_write_io_error(bh)) + set_bit(AS_EIO, &page->mapping->flags); + if (buffer_busy(bh)) + goto failed; + if (!buffer_uptodate(bh) && !buffer_req(bh)) + was_uptodate = 0; + bh = bh->b_this_page; + } while (bh != head); + + if (!was_uptodate && PageUptodate(page)) + buffer_error(); + + do { + struct buffer_head *next = bh->b_this_page; + + if (!list_empty(&bh->b_assoc_buffers)) + __remove_assoc_queue(bh); + bh = next; + } while (bh != head); + *buffers_to_free = head; + __clear_page_buffers(page); + return 1; +failed: + return 0; +} + +int try_to_free_buffers(struct page *page) +{ + struct address_space * const mapping = page->mapping; + struct buffer_head *buffers_to_free = NULL; + int ret = 0; + + BUG_ON(!PageLocked(page)); + if (PageWriteback(page)) + return 0; + + if (mapping == NULL) { /* swapped-in anon page */ + ret = drop_buffers(page, &buffers_to_free); + goto out; + } + + spin_lock(&mapping->private_lock); + ret = drop_buffers(page, &buffers_to_free); + if (ret && !PageSwapCache(page)) { + /* + * If the filesystem writes its buffers by hand (eg ext3) + * then we can have clean buffers against a dirty page. We + * clean the page here; otherwise later reattachment of buffers + * could encounter a non-uptodate page, which is unresolvable. + * This only applies in the rare case where try_to_free_buffers + * succeeds but the page is not freed. + */ + clear_page_dirty(page); + } + spin_unlock(&mapping->private_lock); +out: + if (buffers_to_free) { + struct buffer_head *bh = buffers_to_free; + + do { + struct buffer_head *next = bh->b_this_page; + free_buffer_head(bh); + bh = next; + } while (bh != buffers_to_free); + } + return ret; +} +EXPORT_SYMBOL(try_to_free_buffers); + +int block_sync_page(struct page *page) +{ + blk_run_queues(); + return 0; +} + +/* + * There are no bdflush tunables left. But distributions are + * still running obsolete flush daemons, so we terminate them here. + * + * Use of bdflush() is deprecated and will be removed in a future kernel. + * The `pdflush' kernel threads fully replace bdflush daemons and this call. + */ +asmlinkage long sys_bdflush(int func, long data) +{ + static int msg_count; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (msg_count < 5) { + msg_count++; + printk(KERN_INFO + "warning: process `%s' used the obsolete bdflush" + " system call\n", current->comm); + printk(KERN_INFO "Fix your initscripts?\n"); + } + + if (func == 1) + do_exit(0); + return 0; +} + +/* + * Buffer-head allocation + */ +static kmem_cache_t *bh_cachep; + +/* + * Once the number of bh's in the machine exceeds this level, we start + * stripping them in writeback. 
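+ *
+ * (The level itself is computed in buffer_init() below: the number of
+ * buffer_heads that fit in roughly 10% of ZONE_NORMAL.  Code outside
+ * this hunk can then test buffer_heads_over_limit cheaply.)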
+ */ +static int max_buffer_heads; + +int buffer_heads_over_limit; + +struct bh_accounting { + int nr; /* Number of live bh's */ + int ratelimit; /* Limit cacheline bouncing */ +}; + +static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0}; + +static void recalc_bh_state(void) +{ + int i; + int tot = 0; + + if (__get_cpu_var(bh_accounting).ratelimit++ < 4096) + return; + __get_cpu_var(bh_accounting).ratelimit = 0; + for (i = 0; i < NR_CPUS; i++) { + if (cpu_online(i)) + tot += per_cpu(bh_accounting, i).nr; + } + buffer_heads_over_limit = (tot > max_buffer_heads); +} + +struct buffer_head *alloc_buffer_head(int gfp_flags) +{ + struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); + if (ret) { + preempt_disable(); + __get_cpu_var(bh_accounting).nr++; + recalc_bh_state(); + preempt_enable(); + } + return ret; +} +EXPORT_SYMBOL(alloc_buffer_head); + +void free_buffer_head(struct buffer_head *bh) +{ + BUG_ON(!list_empty(&bh->b_assoc_buffers)); + kmem_cache_free(bh_cachep, bh); + preempt_disable(); + __get_cpu_var(bh_accounting).nr--; + recalc_bh_state(); + preempt_enable(); +} +EXPORT_SYMBOL(free_buffer_head); + +static void +init_buffer_head(void *data, kmem_cache_t *cachep, unsigned long flags) +{ + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + struct buffer_head * bh = (struct buffer_head *)data; + + memset(bh, 0, sizeof(*bh)); + INIT_LIST_HEAD(&bh->b_assoc_buffers); + } +} + +static void buffer_init_cpu(int cpu) +{ + struct bh_accounting *bha = &per_cpu(bh_accounting, cpu); + struct bh_lru *bhl = &per_cpu(bh_lrus, cpu); + + bha->nr = 0; + bha->ratelimit = 0; + memset(bhl, 0, sizeof(*bhl)); +} + +static int __devinit buffer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + switch(action) { + case CPU_UP_PREPARE: + buffer_init_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata buffer_nb = { + .notifier_call = buffer_cpu_notify, +}; + +void __init buffer_init(void) +{ + int i; + int nrpages; + + bh_cachep = kmem_cache_create("buffer_head", + sizeof(struct buffer_head), 0, + 0, init_buffer_head, NULL); + for (i = 0; i < ARRAY_SIZE(bh_wait_queue_heads); i++) + init_waitqueue_head(&bh_wait_queue_heads[i].wqh); + + /* + * Limit the bh occupancy to 10% of ZONE_NORMAL + */ + nrpages = (nr_free_buffer_pages() * 10) / 100; + max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); + buffer_cpu_notify(&buffer_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&buffer_nb); +} + +EXPORT_SYMBOL(__bforget); +EXPORT_SYMBOL(__brelse); +EXPORT_SYMBOL(__wait_on_buffer); +EXPORT_SYMBOL(block_commit_write); +EXPORT_SYMBOL(block_prepare_write); +EXPORT_SYMBOL(block_read_full_page); +EXPORT_SYMBOL(block_sync_page); +EXPORT_SYMBOL(block_truncate_page); +EXPORT_SYMBOL(block_write_full_page); +EXPORT_SYMBOL(buffer_insert_list); +EXPORT_SYMBOL(cont_prepare_write); +EXPORT_SYMBOL(end_buffer_async_write); +EXPORT_SYMBOL(end_buffer_read_sync); +EXPORT_SYMBOL(end_buffer_write_sync); +EXPORT_SYMBOL(file_fsync); +EXPORT_SYMBOL(fsync_bdev); +EXPORT_SYMBOL(fsync_buffers_list); +EXPORT_SYMBOL(generic_block_bmap); +EXPORT_SYMBOL(generic_commit_write); +EXPORT_SYMBOL(generic_cont_expand); +EXPORT_SYMBOL(init_buffer); +EXPORT_SYMBOL(invalidate_bdev); +EXPORT_SYMBOL(ll_rw_block); +EXPORT_SYMBOL(mark_buffer_dirty); +EXPORT_SYMBOL(submit_bh); +EXPORT_SYMBOL(sync_dirty_buffer); 
+EXPORT_SYMBOL(unlock_buffer);
diff -Nru a/fs/direct-io.c b/fs/direct-io.c
--- a/fs/direct-io.c	Fri Oct 31 14:10:54 2003
+++ b/fs/direct-io.c	Fri Oct 31 14:10:54 2003
@@ -677,7 +677,7 @@
 		this_chunk_bytes = this_chunk_blocks << dio->blkbits;
 
-		page = ZERO_PAGE(dio->cur_user_address);
+		page = ZERO_PAGE(dio->curr_user_address);
 		if (submit_page_section(dio, page, 0, this_chunk_bytes,
 					dio->next_block_for_io))
 			return;
diff -Nru a/fs/fat/inode.c b/fs/fat/inode.c
--- a/fs/fat/inode.c	Fri Oct 31 14:10:53 2003
+++ b/fs/fat/inode.c	Fri Oct 31 14:10:53 2003
@@ -964,13 +964,17 @@
 		error = first;
 		goto out_fail;
 	}
-	if (FAT_FIRST_ENT(sb, media) != first
-	    && (media != 0xf8 || FAT_FIRST_ENT(sb, 0xfe) != first)) {
-		if (!silent) {
+	if (FAT_FIRST_ENT(sb, media) == first) {
+		/* all is as it should be */
+	} else if (media == 0xf8 && FAT_FIRST_ENT(sb, 0xfe) == first) {
+		/* bad, reported on pc9800 */
+	} else if (first == 0) {
+		/* bad, reported with a SmartMedia card */
+	} else {
+		if (!silent)
 			printk(KERN_ERR "FAT: invalid first entry of FAT "
 			       "(0x%x != 0x%x)\n",
 			       FAT_FIRST_ENT(sb, media), first);
-		}
 		goto out_invalid;
 	}
diff -Nru a/fs/fs-writeback.c b/fs/fs-writeback.c
--- a/fs/fs-writeback.c	Fri Oct 31 14:10:54 2003
+++ b/fs/fs-writeback.c	Fri Oct 31 14:10:54 2003
@@ -248,8 +248,8 @@
  * on the writer throttling path, and we get decent balancing between many
  * throttled threads: we don't want them all piling up on __wait_on_inode.
  */
-static void
-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+void
+generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 {
 	const unsigned long start = jiffies;	/* livelock avoidance */
 
@@ -323,6 +323,16 @@
 	}
 	return;		/* Leave any unwritten inodes on s_io */
 }
+
+static void
+sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+{
+	if (sb->s_op->sync_inodes)
+		sb->s_op->sync_inodes(sb, wbc);
+	else
+		generic_sync_sb_inodes(sb, wbc);
+}
+
 /*
  * Start writeback of dirty pagecache data against all unlocked inodes.
diff -Nru a/fs/hostfs/Makefile b/fs/hostfs/Makefile
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/fs/hostfs/Makefile	Fri Oct 31 14:10:55 2003
@@ -0,0 +1,36 @@
+#
+# Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
+# Licensed under the GPL
+#
+
+# struct stat64 changed the inode field name between 2.2 and 2.4 from st_ino
+# to __st_ino.  It stayed in the same place, so as long as the correct name
+# is used, hostfs compiled on 2.2 should work on 2.4 and vice versa.
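+#
+# (On hosts whose bits/stat.h defines __st_ino, the shell fragment below
+# expands to "__st_ino"; on older hosts it yields plain "st_ino".)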
+ +STAT64_INO_FIELD := $(shell grep -q __st_ino /usr/include/bits/stat.h && \ + echo __)st_ino + +hostfs-objs := hostfs_kern.o hostfs_user.o + +obj-y = +obj-$(CONFIG_HOSTFS) += hostfs.o + +SINGLE_OBJS = $(foreach f,$(patsubst %.o,%,$(obj-y) $(obj-m)),$($(f)-objs)) + +USER_OBJS := $(filter %_user.o,$(obj-y) $(obj-m) $(SINGLE_OBJS)) +USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) + +USER_CFLAGS += -DSTAT64_INO_FIELD=$(STAT64_INO_FIELD) + +$(USER_OBJS) : %.o: %.c + $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< + +clean: + +modules: + +fastdep: + +dep: + +archmrproper: clean diff -Nru a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/hostfs/hostfs.h Fri Oct 31 14:10:54 2003 @@ -0,0 +1,79 @@ +#ifndef __UM_FS_HOSTFS +#define __UM_FS_HOSTFS + +#include "os.h" + +/* These are exactly the same definitions as in fs.h, but the names are + * changed so that this file can be included in both kernel and user files. + */ + +#define HOSTFS_ATTR_MODE 1 +#define HOSTFS_ATTR_UID 2 +#define HOSTFS_ATTR_GID 4 +#define HOSTFS_ATTR_SIZE 8 +#define HOSTFS_ATTR_ATIME 16 +#define HOSTFS_ATTR_MTIME 32 +#define HOSTFS_ATTR_CTIME 64 +#define HOSTFS_ATTR_ATIME_SET 128 +#define HOSTFS_ATTR_MTIME_SET 256 +#define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ +#define HOSTFS_ATTR_ATTR_FLAG 1024 + +struct hostfs_iattr { + unsigned int ia_valid; + mode_t ia_mode; + uid_t ia_uid; + gid_t ia_gid; + loff_t ia_size; + struct timespec ia_atime; + struct timespec ia_mtime; + struct timespec ia_ctime; + unsigned int ia_attr_flags; +}; + +extern int stat_file(const char *path, unsigned long long *inode_out, + int *mode_out, int *nlink_out, int *uid_out, int *gid_out, + unsigned long long *size_out, struct timespec *atime_out, + struct timespec *mtime_out, struct timespec *ctime_out, + int *blksize_out, unsigned long long *blocks_out); +extern int access_file(char *path, int r, int w, int x); +extern int open_file(char *path, int r, int w, int append); +extern int file_type(const char *path, int *rdev); +extern void *open_dir(char *path, int *err_out); +extern char *read_dir(void *stream, unsigned long long *pos, + unsigned long long *ino_out, int *len_out); +extern void close_file(void *stream); +extern void close_dir(void *stream); +extern int read_file(int fd, unsigned long long *offset, char *buf, int len); +extern int write_file(int fd, unsigned long long *offset, const char *buf, + int len); +extern int lseek_file(int fd, long long offset, int whence); +extern int file_create(char *name, int ur, int uw, int ux, int gr, + int gw, int gx, int or, int ow, int ox); +extern int set_attr(const char *file, struct hostfs_iattr *attrs); +extern int make_symlink(const char *from, const char *to); +extern int unlink_file(const char *file); +extern int do_mkdir(const char *file, int mode); +extern int do_rmdir(const char *file); +extern int do_mknod(const char *file, int mode, int dev); +extern int link_file(const char *from, const char *to); +extern int do_readlink(char *file, char *buf, int size); +extern int rename_file(char *from, char *to); +extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, + long long *bfree_out, long long *bavail_out, + long long *files_out, long long *ffree_out, + void *fsid_out, int fsid_size, long *namelen_out, + long *spare_out); + +#endif + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. 
+ * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/hostfs/hostfs_kern.c Fri Oct 31 14:10:54 2003 @@ -0,0 +1,1008 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + * + * Ported the filesystem routines to 2.5. + * 2003-02-10 Petr Baudis <pasky@ucw.cz> + */ + +#include <linux/stddef.h> +#include <linux/fs.h> +#include <linux/version.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/pagemap.h> +#include <linux/blkdev.h> +#include <linux/list.h> +#include <linux/buffer_head.h> +#include <linux/root_dev.h> +#include <linux/statfs.h> +#include <asm/uaccess.h> +#include "hostfs.h" +#include "kern_util.h" +#include "kern.h" +#include "user_util.h" +#include "2_5compat.h" +#include "init.h" + +struct hostfs_inode_info { + char *host_filename; + int fd; + int mode; + struct inode vfs_inode; +}; + +static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) +{ + return(list_entry(inode, struct hostfs_inode_info, vfs_inode)); +} + +#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_dentry->d_inode) + +int hostfs_d_delete(struct dentry *dentry) +{ + return(1); +} + +struct dentry_operations hostfs_dentry_ops = { + .d_delete = hostfs_d_delete, +}; + +/* Changed in hostfs_args before the kernel starts running */ +static char *root_ino = "/"; +static int append = 0; + +#define HOSTFS_SUPER_MAGIC 0x00c0ffee + +static struct inode_operations hostfs_iops; +static struct inode_operations hostfs_dir_iops; +static struct address_space_operations hostfs_link_aops; + +static int __init hostfs_args(char *options, int *add) +{ + char *ptr; + + ptr = strchr(options, ','); + if(ptr != NULL) + *ptr++ = '\0'; + if(*options != '\0') + root_ino = options; + + options = ptr; + while(options){ + ptr = strchr(options, ','); + if(ptr != NULL) + *ptr++ = '\0'; + if(*options != '\0'){ + if(!strcmp(options, "append")) + append = 1; + else printf("hostfs_args - unsupported option - %s\n", + options); + } + options = ptr; + } + return(0); +} + +__uml_setup("hostfs=", hostfs_args, +"hostfs=<root dir>,<flags>,...\n" +" This is used to set hostfs parameters. The root directory argument\n" +" is used to confine all hostfs mounts to within the specified directory\n" +" tree on the host. 
If this isn't specified, then a user inside UML can\n" +" mount anything on the host that's accessible to the user that's running\n" +" it.\n" +" The only flag currently supported is 'append', which specifies that all\n" +" files opened by hostfs will be opened in append mode.\n\n" +); + +static char *dentry_name(struct dentry *dentry, int extra) +{ + struct dentry *parent; + char *root, *name; + int len; + + len = 0; + parent = dentry; + while(parent->d_parent != parent){ + len += parent->d_name.len + 1; + parent = parent->d_parent; + } + + root = HOSTFS_I(parent->d_inode)->host_filename; + len += strlen(root); + name = kmalloc(len + extra + 1, GFP_KERNEL); + if(name == NULL) return(NULL); + + name[len] = '\0'; + parent = dentry; + while(parent->d_parent != parent){ + len -= parent->d_name.len + 1; + name[len] = '/'; + strncpy(&name[len + 1], parent->d_name.name, + parent->d_name.len); + parent = parent->d_parent; + } + strncpy(name, root, strlen(root)); + return(name); +} + +static char *inode_name(struct inode *ino, int extra) +{ + struct dentry *dentry; + + dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias); + return(dentry_name(dentry, extra)); +} + +static int read_name(struct inode *ino, char *name) +{ + /* The non-int inode fields are copied into ints by stat_file and + * then copied into the inode because passing the actual pointers + * in and having them treated as int * breaks on big-endian machines + */ + int err; + int i_mode, i_nlink, i_blksize; + unsigned long long i_size; + unsigned long long i_ino; + unsigned long long i_blocks; + + err = stat_file(name, &i_ino, &i_mode, &i_nlink, &ino->i_uid, + &ino->i_gid, &i_size, &ino->i_atime, &ino->i_mtime, + &ino->i_ctime, &i_blksize, &i_blocks); + if(err) + return(err); + + ino->i_ino = i_ino; + ino->i_mode = i_mode; + ino->i_nlink = i_nlink; + ino->i_size = i_size; + ino->i_blksize = i_blksize; + ino->i_blocks = i_blocks; + if((ino->i_sb->s_dev == ROOT_DEV) && (ino->i_uid == getuid())) + ino->i_uid = 0; + return(0); +} + +static char *follow_link(char *link) +{ + int len, n; + char *name, *resolved, *end; + + len = 64; + while(1){ + n = -ENOMEM; + name = kmalloc(len, GFP_KERNEL); + if(name == NULL) + goto out; + + n = do_readlink(link, name, len); + if(n < len) + break; + len *= 2; + kfree(name); + } + if(n < 0) + goto out_free; + + if(*name == '/') + return(name); + + end = strrchr(link, '/'); + if(end == NULL) + return(name); + + *(end + 1) = '\0'; + len = strlen(link) + strlen(name) + 1; + + resolved = kmalloc(len, GFP_KERNEL); + if(resolved == NULL){ + n = -ENOMEM; + goto out_free; + } + + sprintf(resolved, "%s%s", link, name); + kfree(name); + kfree(link); + return(resolved); + + out_free: + kfree(name); + out: + return(ERR_PTR(n)); +} + +static int read_inode(struct inode *ino) +{ + char *name; + int err = 0; + + /* Unfortunately, we are called from iget() when we don't have a dentry + * allocated yet. 
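+ * In that case we simply return 0 and leave the inode fields unset;
+ * hostfs_lookup() fills them in later with an explicit read_name()
+ * call.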
+ */ + if(list_empty(&ino->i_dentry)) + goto out; + + err = -ENOMEM; + name = inode_name(ino, 0); + if(name == NULL) + goto out; + + if(file_type(name, NULL) == OS_TYPE_SYMLINK){ + name = follow_link(name); + if(IS_ERR(name)){ + err = PTR_ERR(name); + goto out; + } + } + + err = read_name(ino, name); + kfree(name); + out: + return(err); +} + +int hostfs_statfs(struct super_block *sb, struct kstatfs *sf) +{ + /* do_statfs uses struct statfs64 internally, but the linux kernel + * struct statfs still has 32-bit versions for most of these fields, + * so we convert them here + */ + int err; + long long f_blocks; + long long f_bfree; + long long f_bavail; + long long f_files; + long long f_ffree; + + err = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename, + &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, + &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), + &sf->f_namelen, sf->f_spare); + if(err) return(err); + sf->f_blocks = f_blocks; + sf->f_bfree = f_bfree; + sf->f_bavail = f_bavail; + sf->f_files = f_files; + sf->f_ffree = f_ffree; + sf->f_type = HOSTFS_SUPER_MAGIC; + return(0); +} + +static struct inode *hostfs_alloc_inode(struct super_block *sb) +{ + struct hostfs_inode_info *hi; + + hi = kmalloc(sizeof(*hi), GFP_KERNEL); + if(hi == NULL) + return(NULL); + + *hi = ((struct hostfs_inode_info) { .host_filename = NULL, + .fd = -1, + .mode = 0 }); + inode_init_once(&hi->vfs_inode); + return(&hi->vfs_inode); +} + +static void hostfs_destroy_inode(struct inode *inode) +{ + if(HOSTFS_I(inode)->host_filename) + kfree(HOSTFS_I(inode)->host_filename); + + if(HOSTFS_I(inode)->fd != -1) + close_file(&HOSTFS_I(inode)->fd); + + kfree(HOSTFS_I(inode)); +} + +static void hostfs_read_inode(struct inode *inode) +{ + read_inode(inode); +} + +static struct super_operations hostfs_sbops = { + .alloc_inode = hostfs_alloc_inode, + .destroy_inode = hostfs_destroy_inode, + .read_inode = hostfs_read_inode, + .statfs = hostfs_statfs, +}; + +int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) +{ + void *dir; + char *name; + unsigned long long next, ino; + int error, len; + + name = dentry_name(file->f_dentry, 0); + if(name == NULL) return(-ENOMEM); + dir = open_dir(name, &error); + kfree(name); + if(dir == NULL) return(-error); + next = file->f_pos; + while((name = read_dir(dir, &next, &ino, &len)) != NULL){ + error = (*filldir)(ent, name, len, file->f_pos, + ino, DT_UNKNOWN); + if(error) break; + file->f_pos = next; + } + close_dir(dir); + return(0); +} + +int hostfs_file_open(struct inode *ino, struct file *file) +{ + char *name; + int mode = 0, r = 0, w = 0, fd; + + mode = file->f_mode & (FMODE_READ | FMODE_WRITE); + if((mode & HOSTFS_I(ino)->mode) == mode) + return(0); + + /* The file may already have been opened, but with the wrong access, + * so this resets things and reopens the file with the new access. 
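+ * E.g. a file opened read-only and then opened again for writing ends
+ * up with one host fd opened for both, which is why write access
+ * implies read access below.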
+ */
+	if(HOSTFS_I(ino)->fd != -1){
+		close_file(&HOSTFS_I(ino)->fd);
+		HOSTFS_I(ino)->fd = -1;
+	}
+
+	HOSTFS_I(ino)->mode |= mode;
+	if(HOSTFS_I(ino)->mode & FMODE_READ)
+		r = 1;
+	if(HOSTFS_I(ino)->mode & FMODE_WRITE)
+		w = 1;
+	if(w)
+		r = 1;
+
+	name = dentry_name(file->f_dentry, 0);
+	if(name == NULL)
+		return(-ENOMEM);
+
+	fd = open_file(name, r, w, append);
+	kfree(name);
+	if(fd < 0) return(fd);
+	FILE_HOSTFS_I(file)->fd = fd;
+
+	return(0);
+}
+
+int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	return(0);
+}
+
+static struct file_operations hostfs_file_fops = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_file_read,
+	.write		= generic_file_write,
+	.mmap		= generic_file_mmap,
+	.open		= hostfs_file_open,
+	.release	= NULL,
+	.fsync		= hostfs_fsync,
+};
+
+static struct file_operations hostfs_dir_fops = {
+	.readdir	= hostfs_readdir,
+	.read		= generic_read_dir,
+};
+
+int hostfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	char *buffer;
+	unsigned long long base;
+	int count = PAGE_CACHE_SIZE;
+	int end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int err;
+
+	if (page->index >= end_index)
+		count = inode->i_size & (PAGE_CACHE_SIZE-1);
+
+	buffer = kmap(page);
+	base = ((unsigned long long) page->index) << PAGE_CACHE_SHIFT;
+
+	err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
+	if(err != count){
+		ClearPageUptodate(page);
+		goto out;
+	}
+
+	if (base > inode->i_size)
+		inode->i_size = base;
+
+	if (PageError(page))
+		ClearPageError(page);
+	err = 0;
+
+ out:
+	kunmap(page);
+
+	unlock_page(page);
+	return err;
+}
+
+int hostfs_readpage(struct file *file, struct page *page)
+{
+	char *buffer;
+	long long start;
+	int err = 0;
+
+	start = (long long) page->index << PAGE_CACHE_SHIFT;
+	buffer = kmap(page);
+	err = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
+			PAGE_CACHE_SIZE);
+	if(err < 0) goto out;
+
+	memset(&buffer[err], 0, PAGE_CACHE_SIZE - err);
+
+	flush_dcache_page(page);
+	SetPageUptodate(page);
+	if (PageError(page)) ClearPageError(page);
+	err = 0;
+ out:
+	kunmap(page);
+	unlock_page(page);
+	return(err);
+}
+
+int hostfs_prepare_write(struct file *file, struct page *page,
+			 unsigned int from, unsigned int to)
+{
+	char *buffer;
+	long long start, tmp;
+	int err;
+
+	start = (long long) page->index << PAGE_CACHE_SHIFT;
+	buffer = kmap(page);
+	if(from != 0){
+		tmp = start;
+		err = read_file(FILE_HOSTFS_I(file)->fd, &tmp, buffer,
+				from);
+		if(err < 0) goto out;
+	}
+	if(to != PAGE_CACHE_SIZE){
+		start += to;
+		err = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer + to,
+				PAGE_CACHE_SIZE - to);
+		if(err < 0) goto out;
+	}
+	err = 0;
+ out:
+	kunmap(page);
+	return(err);
+}
+
+int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
+			unsigned to)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	char *buffer;
+	long long start;
+	int err = 0;
+
+	/* Widen the index before shifting so that offsets beyond 4GB
+	 * don't overflow on 32-bit hosts.
+	 */
+	start = ((long long) page->index << PAGE_CACHE_SHIFT) + from;
+	buffer = kmap(page);
+	err = write_file(FILE_HOSTFS_I(file)->fd, &start, buffer + from,
+			 to - from);
+	if(err > 0) err = 0;
+	if(!err && (start > inode->i_size))
+		inode->i_size = start;
+
+	kunmap(page);
+	return(err);
+}
+
+static struct address_space_operations hostfs_aops = {
+	.writepage 	= hostfs_writepage,
+	.readpage	= hostfs_readpage,
+/*	.set_page_dirty = __set_page_dirty_nobuffers, */
+	.prepare_write	= hostfs_prepare_write,
+	
.commit_write = hostfs_commit_write +}; + +static int init_inode(struct inode *inode, struct dentry *dentry) +{ + char *name; + int type, err = -ENOMEM, rdev; + + if(dentry){ + name = dentry_name(dentry, 0); + if(name == NULL) + goto out; + type = file_type(name, &rdev); + kfree(name); + } + else type = OS_TYPE_DIR; + + err = 0; + if(type == OS_TYPE_SYMLINK) + inode->i_op = &page_symlink_inode_operations; + else if(type == OS_TYPE_DIR) + inode->i_op = &hostfs_dir_iops; + else inode->i_op = &hostfs_iops; + + if(type == OS_TYPE_DIR) inode->i_fop = &hostfs_dir_fops; + else inode->i_fop = &hostfs_file_fops; + + if(type == OS_TYPE_SYMLINK) + inode->i_mapping->a_ops = &hostfs_link_aops; + else inode->i_mapping->a_ops = &hostfs_aops; + + switch (type) { + case OS_TYPE_CHARDEV: + init_special_inode(inode, S_IFCHR, rdev); + break; + case OS_TYPE_BLOCKDEV: + init_special_inode(inode, S_IFBLK, rdev); + break; + case OS_TYPE_FIFO: + init_special_inode(inode, S_IFIFO, 0); + break; + case OS_TYPE_SOCK: + init_special_inode(inode, S_IFSOCK, 0); + break; + } + out: + return(err); +} + +int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) +{ + struct inode *inode; + char *name; + int error, fd; + + error = -ENOMEM; + inode = iget(dir->i_sb, 0); + if(inode == NULL) goto out; + + error = init_inode(inode, dentry); + if(error) + goto out_put; + + error = -ENOMEM; + name = dentry_name(dentry, 0); + if(name == NULL) + goto out_put; + + fd = file_create(name, + mode & S_IRUSR, mode & S_IWUSR, mode & S_IXUSR, + mode & S_IRGRP, mode & S_IWGRP, mode & S_IXGRP, + mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH); + if(fd < 0) + error = fd; + else error = read_name(inode, name); + + kfree(name); + if(error) + goto out_put; + + HOSTFS_I(inode)->fd = fd; + HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE; + d_instantiate(dentry, inode); + return(0); + + out_put: + iput(inode); + out: + return(error); +} + +struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + char *name; + int err; + + err = -ENOMEM; + inode = iget(ino->i_sb, 0); + if(inode == NULL) + goto out; + + err = init_inode(inode, dentry); + if(err) + goto out_put; + + err = -ENOMEM; + name = dentry_name(dentry, 0); + if(name == NULL) + goto out_put; + + err = read_name(inode, name); + kfree(name); + if(err == -ENOENT){ + iput(inode); + inode = NULL; + } + else if(err) + goto out_put; + + d_add(dentry, inode); + dentry->d_op = &hostfs_dentry_ops; + return(NULL); + + out_put: + iput(inode); + out: + return(ERR_PTR(err)); +} + +static char *inode_dentry_name(struct inode *ino, struct dentry *dentry) +{ + char *file; + int len; + + file = inode_name(ino, dentry->d_name.len + 1); + if(file == NULL) return(NULL); + strcat(file, "/"); + len = strlen(file); + strncat(file, dentry->d_name.name, dentry->d_name.len); + file[len + dentry->d_name.len] = '\0'; + return(file); +} + +int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) +{ + char *from_name, *to_name; + int err; + + if((from_name = inode_dentry_name(ino, from)) == NULL) + return(-ENOMEM); + to_name = dentry_name(to, 0); + if(to_name == NULL){ + kfree(from_name); + return(-ENOMEM); + } + err = link_file(to_name, from_name); + kfree(from_name); + kfree(to_name); + return(err); +} + +int hostfs_unlink(struct inode *ino, struct dentry *dentry) +{ + char *file; + int err; + + if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); + if(append) + return(-EPERM); + + err = 
unlink_file(file); + kfree(file); + return(err); +} + +int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to) +{ + char *file; + int err; + + if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); + err = make_symlink(file, to); + kfree(file); + return(err); +} + +int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode) +{ + char *file; + int err; + + if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); + err = do_mkdir(file, mode); + kfree(file); + return(err); +} + +int hostfs_rmdir(struct inode *ino, struct dentry *dentry) +{ + char *file; + int err; + + if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM); + err = do_rmdir(file); + kfree(file); + return(err); +} + +int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + struct inode *inode; + char *name; + int err = -ENOMEM; + + inode = iget(dir->i_sb, 0); + if(inode == NULL) + goto out; + + err = init_inode(inode, dentry); + if(err) + goto out_put; + + err = -ENOMEM; + name = dentry_name(dentry, 0); + if(name == NULL) + goto out_put; + + init_special_inode(inode, mode, dev); + err = do_mknod(name, mode, dev); + if(err) + goto out_free; + + err = read_name(inode, name); + kfree(name); + if(err) + goto out_put; + + d_instantiate(dentry, inode); + return(0); + + out_free: + kfree(name); + out_put: + iput(inode); + out: + return(err); +} + +int hostfs_rename(struct inode *from_ino, struct dentry *from, + struct inode *to_ino, struct dentry *to) +{ + char *from_name, *to_name; + int err; + + if((from_name = inode_dentry_name(from_ino, from)) == NULL) + return(-ENOMEM); + if((to_name = inode_dentry_name(to_ino, to)) == NULL){ + kfree(from_name); + return(-ENOMEM); + } + err = rename_file(from_name, to_name); + kfree(from_name); + kfree(to_name); + return(err); +} + +void hostfs_truncate(struct inode *ino) +{ + not_implemented(); +} + +int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd) +{ + char *name; + int r = 0, w = 0, x = 0, err; + + if(desired & MAY_READ) r = 1; + if(desired & MAY_WRITE) w = 1; + if(desired & MAY_EXEC) x = 1; + name = inode_name(ino, 0); + if(name == NULL) return(-ENOMEM); + err = access_file(name, r, w, x); + kfree(name); + if(!err) err = vfs_permission(ino, desired); + return(err); +} + +int hostfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct hostfs_iattr attrs; + char *name; + int err; + + if(append) + attr->ia_valid &= ~ATTR_SIZE; + + attrs.ia_valid = 0; + if(attr->ia_valid & ATTR_MODE){ + attrs.ia_valid |= HOSTFS_ATTR_MODE; + attrs.ia_mode = attr->ia_mode; + } + if(attr->ia_valid & ATTR_UID){ + if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) && + (attr->ia_uid == 0)) + attr->ia_uid = getuid(); + attrs.ia_valid |= HOSTFS_ATTR_UID; + attrs.ia_uid = attr->ia_uid; + } + if(attr->ia_valid & ATTR_GID){ + if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) && + (attr->ia_gid == 0)) + attr->ia_gid = getuid(); + attrs.ia_valid |= HOSTFS_ATTR_GID; + attrs.ia_gid = attr->ia_gid; + } + if(attr->ia_valid & ATTR_SIZE){ + attrs.ia_valid |= HOSTFS_ATTR_SIZE; + attrs.ia_size = attr->ia_size; + } + if(attr->ia_valid & ATTR_ATIME){ + attrs.ia_valid |= HOSTFS_ATTR_ATIME; + attrs.ia_atime = attr->ia_atime; + } + if(attr->ia_valid & ATTR_MTIME){ + attrs.ia_valid |= HOSTFS_ATTR_MTIME; + attrs.ia_mtime = attr->ia_mtime; + } + if(attr->ia_valid & ATTR_CTIME){ + attrs.ia_valid |= HOSTFS_ATTR_CTIME; + attrs.ia_ctime = attr->ia_ctime; + } + if(attr->ia_valid & ATTR_ATIME_SET){ + attrs.ia_valid |= 
HOSTFS_ATTR_ATIME_SET; + } + if(attr->ia_valid & ATTR_MTIME_SET){ + attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET; + } + name = dentry_name(dentry, 0); + if(name == NULL) return(-ENOMEM); + err = set_attr(name, &attrs); + kfree(name); + if(err) + return(err); + + return(inode_setattr(dentry->d_inode, attr)); +} + +int hostfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + generic_fillattr(dentry->d_inode, stat); + return(0); +} + +static struct inode_operations hostfs_iops = { + .create = hostfs_create, + .link = hostfs_link, + .unlink = hostfs_unlink, + .symlink = hostfs_symlink, + .mkdir = hostfs_mkdir, + .rmdir = hostfs_rmdir, + .mknod = hostfs_mknod, + .rename = hostfs_rename, + .truncate = hostfs_truncate, + .permission = hostfs_permission, + .setattr = hostfs_setattr, + .getattr = hostfs_getattr, +}; + +static struct inode_operations hostfs_dir_iops = { + .create = hostfs_create, + .lookup = hostfs_lookup, + .link = hostfs_link, + .unlink = hostfs_unlink, + .symlink = hostfs_symlink, + .mkdir = hostfs_mkdir, + .rmdir = hostfs_rmdir, + .mknod = hostfs_mknod, + .rename = hostfs_rename, + .truncate = hostfs_truncate, + .permission = hostfs_permission, + .setattr = hostfs_setattr, + .getattr = hostfs_getattr, +}; + +int hostfs_link_readpage(struct file *file, struct page *page) +{ + char *buffer, *name; + long long start; + int err; + + start = page->index << PAGE_CACHE_SHIFT; + buffer = kmap(page); + name = inode_name(page->mapping->host, 0); + if(name == NULL) return(-ENOMEM); + err = do_readlink(name, buffer, PAGE_CACHE_SIZE); + kfree(name); + if(err == PAGE_CACHE_SIZE) + err = -E2BIG; + else if(err > 0){ + flush_dcache_page(page); + SetPageUptodate(page); + if (PageError(page)) ClearPageError(page); + err = 0; + } + kunmap(page); + unlock_page(page); + return(err); +} + +static struct address_space_operations hostfs_link_aops = { + .readpage = hostfs_link_readpage, +}; + +static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) +{ + struct inode *root_inode; + char *name, *data = d; + int err; + + sb->s_blocksize = 1024; + sb->s_blocksize_bits = 10; + sb->s_magic = HOSTFS_SUPER_MAGIC; + sb->s_op = &hostfs_sbops; + + if((data == NULL) || (*data == '\0')) + data = root_ino; + + err = -ENOMEM; + name = kmalloc(strlen(data) + 1, GFP_KERNEL); + if(name == NULL) + goto out; + + strcpy(name, data); + + root_inode = iget(sb, 0); + if(root_inode == NULL) + goto out_free; + + err = init_inode(root_inode, NULL); + if(err) + goto out_put; + + HOSTFS_I(root_inode)->host_filename = name; + + err = -ENOMEM; + sb->s_root = d_alloc_root(root_inode); + if(sb->s_root == NULL) + goto out_put; + + err = read_inode(root_inode); + if(err) + goto out_put; + + return(0); + + out_put: + iput(root_inode); + out_free: + kfree(name); + out: + return(err); +} + +static struct super_block *hostfs_read_sb(struct file_system_type *type, + int flags, const char *dev_name, + void *data) +{ + return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common)); +} + +static struct file_system_type hostfs_type = { + .owner = THIS_MODULE, + .name = "hostfs", + .get_sb = hostfs_read_sb, + .kill_sb = kill_anon_super, + .fs_flags = 0, +}; + +static int __init init_hostfs(void) +{ + return(register_filesystem(&hostfs_type)); +} + +static void __exit exit_hostfs(void) +{ + unregister_filesystem(&hostfs_type); +} + +module_init(init_hostfs) +module_exit(exit_hostfs) +MODULE_LICENSE("GPL"); + +/* + * Overrides for Emacs so that we follow Linus's tabbing style. 
+ * Emacs will notice this stuff at the end of the file and automatically + * adjust the settings for this buffer only. This must remain at the end + * of the file. + * --------------------------------------------------------------------------- + * Local variables: + * c-file-style: "linux" + * End: + */ diff -Nru a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/fs/hostfs/hostfs_user.c Fri Oct 31 14:10:54 2003 @@ -0,0 +1,361 @@ +/* + * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#include <unistd.h> +#include <stdio.h> +#include <fcntl.h> +#include <dirent.h> +#include <errno.h> +#include <utime.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/vfs.h> +#include "hostfs.h" +#include "kern_util.h" +#include "user.h" + +int stat_file(const char *path, unsigned long long *inode_out, int *mode_out, + int *nlink_out, int *uid_out, int *gid_out, + unsigned long long *size_out, struct timespec *atime_out, + struct timespec *mtime_out, struct timespec *ctime_out, + int *blksize_out, unsigned long long *blocks_out) +{ + struct stat64 buf; + + if(lstat64(path, &buf) < 0) + return(-errno); + + /* See the Makefile for why STAT64_INO_FIELD is passed in + * by the build + */ + if(inode_out != NULL) *inode_out = buf.STAT64_INO_FIELD; + if(mode_out != NULL) *mode_out = buf.st_mode; + if(nlink_out != NULL) *nlink_out = buf.st_nlink; + if(uid_out != NULL) *uid_out = buf.st_uid; + if(gid_out != NULL) *gid_out = buf.st_gid; + if(size_out != NULL) *size_out = buf.st_size; + if(atime_out != NULL) { + atime_out->tv_sec = buf.st_atime; + atime_out->tv_nsec = 0; + } + if(mtime_out != NULL) { + mtime_out->tv_sec = buf.st_mtime; + mtime_out->tv_nsec = 0; + } + if(ctime_out != NULL) { + ctime_out->tv_sec = buf.st_ctime; + ctime_out->tv_nsec = 0; + } + if(blksize_out != NULL) *blksize_out = buf.st_blksize; + if(blocks_out != NULL) *blocks_out = buf.st_blocks; + return(0); +} + +int file_type(const char *path, int *rdev) +{ + struct stat64 buf; + + if(lstat64(path, &buf) < 0) + return(-errno); + if(rdev != NULL) + *rdev = buf.st_rdev; + + if(S_ISDIR(buf.st_mode)) return(OS_TYPE_DIR); + else if(S_ISLNK(buf.st_mode)) return(OS_TYPE_SYMLINK); + else if(S_ISCHR(buf.st_mode)) return(OS_TYPE_CHARDEV); + else if(S_ISBLK(buf.st_mode)) return(OS_TYPE_BLOCKDEV); + else if(S_ISFIFO(buf.st_mode))return(OS_TYPE_FIFO); + else if(S_ISSOCK(buf.st_mode))return(OS_TYPE_SOCK); + else return(OS_TYPE_FILE); +} + +int access_file(char *path, int r, int w, int x) +{ + int mode = 0; + + if(r) mode = R_OK; + if(w) mode |= W_OK; + if(x) mode |= X_OK; + if(access(path, mode) != 0) return(-errno); + else return(0); +} + +int open_file(char *path, int r, int w, int append) +{ + int mode = 0, fd; + + if(r && !w) + mode = O_RDONLY; + else if(!r && w) + mode = O_WRONLY; + else if(r && w) + mode = O_RDWR; + else panic("Impossible mode in open_file"); + + if(append) + mode |= O_APPEND; + fd = open64(path, mode); + if(fd < 0) return(-errno); + else return(fd); +} + +void *open_dir(char *path, int *err_out) +{ + DIR *dir; + + dir = opendir(path); + *err_out = errno; + if(dir == NULL) return(NULL); + return(dir); +} + +char *read_dir(void *stream, unsigned long long *pos, + unsigned long long *ino_out, int *len_out) +{ + DIR *dir = stream; + struct dirent *ent; + + seekdir(dir, *pos); + ent = readdir(dir); + if(ent == NULL) return(NULL); + *len_out = strlen(ent->d_name); + *ino_out = ent->d_ino; + *pos = telldir(dir); + 
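+	/* ent->d_name points into libc's DIR stream buffer, so it is only
+	 * valid until the next readdir() on this stream; it has to be
+	 * used or copied right away (hostfs_readdir() hands it straight
+	 * to filldir). */
+	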
+	return(ent->d_name);
+}
+
+int read_file(int fd, unsigned long long *offset, char *buf, int len)
+{
+	int n;
+
+	n = pread64(fd, buf, len, *offset);
+	if(n < 0) return(-errno);
+	*offset += n;
+	return(n);
+}
+
+int write_file(int fd, unsigned long long *offset, const char *buf, int len)
+{
+	int n;
+
+	n = pwrite64(fd, buf, len, *offset);
+	if(n < 0) return(-errno);
+	*offset += n;
+	return(n);
+}
+
+int lseek_file(int fd, long long offset, int whence)
+{
+	int ret;
+
+	ret = lseek64(fd, offset, whence);
+	if(ret < 0) return(-errno);
+	return(0);
+}
+
+void close_file(void *stream)
+{
+	close(*((int *) stream));
+}
+
+void close_dir(void *stream)
+{
+	closedir(stream);
+}
+
+int file_create(char *name, int ur, int uw, int ux, int gr,
+		int gw, int gx, int or, int ow, int ox)
+{
+	int mode, fd;
+
+	mode = 0;
+	mode |= ur ? S_IRUSR : 0;
+	mode |= uw ? S_IWUSR : 0;
+	mode |= ux ? S_IXUSR : 0;
+	mode |= gr ? S_IRGRP : 0;
+	mode |= gw ? S_IWGRP : 0;
+	mode |= gx ? S_IXGRP : 0;
+	mode |= or ? S_IROTH : 0;
+	mode |= ow ? S_IWOTH : 0;
+	mode |= ox ? S_IXOTH : 0;
+	fd = open64(name, O_CREAT | O_RDWR, mode);
+	if(fd < 0)
+		return(-errno);
+	return(fd);
+}
+
+int set_attr(const char *file, struct hostfs_iattr *attrs)
+{
+	struct utimbuf buf;
+	int err, ma;
+
+	if(attrs->ia_valid & HOSTFS_ATTR_MODE){
+		if(chmod(file, attrs->ia_mode) != 0) return(-errno);
+	}
+	if(attrs->ia_valid & HOSTFS_ATTR_UID){
+		if(chown(file, attrs->ia_uid, -1)) return(-errno);
+	}
+	if(attrs->ia_valid & HOSTFS_ATTR_GID){
+		if(chown(file, -1, attrs->ia_gid)) return(-errno);
+	}
+	if(attrs->ia_valid & HOSTFS_ATTR_SIZE){
+		if(truncate(file, attrs->ia_size)) return(-errno);
+	}
+	ma = HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET;
+	if((attrs->ia_valid & ma) == ma){
+		buf.actime = attrs->ia_atime.tv_sec;
+		buf.modtime = attrs->ia_mtime.tv_sec;
+		if(utime(file, &buf) != 0) return(-errno);
+	}
+	else {
+		struct timespec ts;
+
+		if(attrs->ia_valid & HOSTFS_ATTR_ATIME_SET){
+			err = stat_file(file, NULL, NULL, NULL, NULL, NULL,
+					NULL, NULL, &ts, NULL, NULL, NULL);
+			if(err != 0)
+				return(err);
+			buf.actime = attrs->ia_atime.tv_sec;
+			buf.modtime = ts.tv_sec;
+			if(utime(file, &buf) != 0)
+				return(-errno);
+		}
+		if(attrs->ia_valid & HOSTFS_ATTR_MTIME_SET){
+			err = stat_file(file, NULL, NULL, NULL, NULL, NULL,
+					NULL, &ts, NULL, NULL, NULL, NULL);
+			if(err != 0)
+				return(err);
+			buf.actime = ts.tv_sec;
+			buf.modtime = attrs->ia_mtime.tv_sec;
+			if(utime(file, &buf) != 0)
+				return(-errno);
+		}
+	}
+	if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ;
+	if(attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)){
+		err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL,
+				&attrs->ia_atime, &attrs->ia_mtime, NULL,
+				NULL, NULL);
+		if(err != 0) return(err);
+	}
+	return(0);
+}
+
+int make_symlink(const char *from, const char *to)
+{
+	int err;
+
+	err = symlink(to, from);
+	if(err) return(-errno);
+	return(0);
+}
+
+int unlink_file(const char *file)
+{
+	int err;
+
+	err = unlink(file);
+	if(err) return(-errno);
+	return(0);
+}
+
+int do_mkdir(const char *file, int mode)
+{
+	int err;
+
+	err = mkdir(file, mode);
+	if(err) return(-errno);
+	return(0);
+}
+
+int do_rmdir(const char *file)
+{
+	int err;
+
+	err = rmdir(file);
+	if(err) return(-errno);
+	return(0);
+}
+
+int do_mknod(const char *file, int mode, int dev)
+{
+	int err;
+
+	err = mknod(file, mode, dev);
+	if(err) return(-errno);
+	return(0);
+}
+
+int link_file(const char *to, const char *from)
+{
+	int err;
+
+	err = link(to, from);
+	if(err) return(-errno);
+	return(0);
+}
+
+int do_readlink(char *file, char *buf, int size)
+{
+	int n;
+
+	n = readlink(file, buf, size);
+	if(n < 0)
+		return(-errno);
+	if(n < size)
+		buf[n] = '\0';
+	return(n);
+}
+
+int rename_file(char *from, char *to)
+{
+	int err;
+
+	err = rename(from, to);
+	if(err < 0) return(-errno);
+	return(0);
+}
+
+int do_statfs(char *root, long *bsize_out, long long *blocks_out,
+	      long long *bfree_out, long long *bavail_out,
+	      long long *files_out, long long *ffree_out,
+	      void *fsid_out, int fsid_size, long *namelen_out,
+	      long *spare_out)
+{
+	struct statfs64 buf;
+	int err;
+
+	err = statfs64(root, &buf);
+	if(err < 0) return(-errno);
+	*bsize_out = buf.f_bsize;
+	*blocks_out = buf.f_blocks;
+	*bfree_out = buf.f_bfree;
+	*bavail_out = buf.f_bavail;
+	*files_out = buf.f_files;
+	*ffree_out = buf.f_ffree;
+	memcpy(fsid_out, &buf.f_fsid,
+	       sizeof(buf.f_fsid) > fsid_size ? fsid_size :
+	       sizeof(buf.f_fsid));
+	*namelen_out = buf.f_namelen;
+	spare_out[0] = buf.f_spare[0];
+	spare_out[1] = buf.f_spare[1];
+	spare_out[2] = buf.f_spare[2];
+	spare_out[3] = buf.f_spare[3];
+	spare_out[4] = buf.f_spare[4];
+	spare_out[5] = buf.f_spare[5];
+	return(0);
+}
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff -Nru a/fs/hppfs/Makefile b/fs/hppfs/Makefile
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/fs/hppfs/Makefile	Fri Oct 31 14:10:54 2003
@@ -0,0 +1,19 @@
+#
+# Copyright (C) 2002, 2003 Jeff Dike (jdike@karaya.com)
+# Licensed under the GPL
+#
+
+hppfs-objs := hppfs_kern.o
+
+obj-y =
+obj-$(CONFIG_HPPFS) += hppfs.o
+
+clean:
+
+modules:
+
+fastdep:
+
+dep:
+
+archmrproper: clean
diff -Nru a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/fs/hppfs/hppfs_kern.c	Fri Oct 31 14:10:55 2003
@@ -0,0 +1,811 @@
+/*
+ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
+ * Licensed under the GPL
+ */
+
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/dcache.h>
+#include <linux/statfs.h>
+#include <asm/uaccess.h>
+#include <asm/fcntl.h>
+#include "os.h"
+
+static int init_inode(struct inode *inode, struct dentry *dentry);
+
+struct hppfs_data {
+	struct list_head list;
+	char contents[PAGE_SIZE - sizeof(struct list_head)];
+};
+
+struct hppfs_private {
+	struct file proc_file;
+	int host_fd;
+	loff_t len;
+	struct hppfs_data *contents;
+};
+
+struct hppfs_inode_info {
+	struct dentry *proc_dentry;
+	struct inode vfs_inode;
+};
+
+static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode)
+{
+	return(list_entry(inode, struct hppfs_inode_info, vfs_inode));
+}
+
+#define HPPFS_SUPER_MAGIC 0xb00000ee
+
+static struct super_operations hppfs_sbops;
+
+static int is_pid(struct dentry *dentry)
+{
+	struct super_block *sb;
+	int i;
+
+	sb = dentry->d_sb;
+	if((sb->s_op != &hppfs_sbops) || (dentry->d_parent != sb->s_root))
+		return(0);
+
+	for(i = 0; i < dentry->d_name.len; i++){
+		if(!isdigit(dentry->d_name.name[i]))
+			return(0);
+	}
+	return(1);
+}
+
+static char *dentry_name(struct dentry *dentry, int extra)
+{
+	struct dentry *parent;
+	char *root, *name;
+	const char *seg_name;
+	int len, seg_len;
+
+	len = 0;
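+	/* First pass: walk up towards the root, adding up how long the
+	 * translated path will be.  A component that names a pid is
+	 * replaced by the literal segment "pid", so every process shares
+	 * the same host directory; the second pass below fills the buffer
+	 * in backwards using the same substitution.
+	 */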
+	parent = dentry;
+	while(parent->d_parent != parent){
+		if(is_pid(parent))
+			len += strlen("pid") + 1;
+		else len += parent->d_name.len + 1;
+		parent = parent->d_parent;
+	}
+
+	root = "proc";
+	len += strlen(root);
+	name = kmalloc(len + extra + 1, GFP_KERNEL);
+	if(name == NULL) return(NULL);
+
+	name[len] = '\0';
+	parent = dentry;
+	while(parent->d_parent != parent){
+		if(is_pid(parent)){
+			seg_name = "pid";
+			seg_len = strlen("pid");
+		}
+		else {
+			seg_name = parent->d_name.name;
+			seg_len = parent->d_name.len;
+		}
+
+		len -= seg_len + 1;
+		name[len] = '/';
+		strncpy(&name[len + 1], seg_name, seg_len);
+		parent = parent->d_parent;
+	}
+	strncpy(name, root, strlen(root));
+	return(name);
+}
+
+struct dentry_operations hppfs_dentry_ops = {
+};
+
+static int file_removed(struct dentry *dentry, const char *file)
+{
+	char *host_file;
+	int extra, fd;
+
+	extra = 0;
+	if(file != NULL) extra += strlen(file) + 1;
+
+	host_file = dentry_name(dentry, extra + strlen("/remove"));
+	if(host_file == NULL){
+		printk("file_removed : allocation failed\n");
+		return(-ENOMEM);
+	}
+
+	if(file != NULL){
+		strcat(host_file, "/");
+		strcat(host_file, file);
+	}
+	strcat(host_file, "/remove");
+
+	fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
+	kfree(host_file);
+	if(fd > 0){
+		os_close_file(fd);
+		return(1);
+	}
+	return(0);
+}
+
+static void hppfs_read_inode(struct inode *ino)
+{
+	struct inode *proc_ino;
+
+	if(HPPFS_I(ino)->proc_dentry == NULL)
+		return;
+
+	proc_ino = HPPFS_I(ino)->proc_dentry->d_inode;
+	ino->i_uid = proc_ino->i_uid;
+	ino->i_gid = proc_ino->i_gid;
+	ino->i_atime = proc_ino->i_atime;
+	ino->i_mtime = proc_ino->i_mtime;
+	ino->i_ctime = proc_ino->i_ctime;
+	ino->i_ino = proc_ino->i_ino;
+	ino->i_mode = proc_ino->i_mode;
+	ino->i_nlink = proc_ino->i_nlink;
+	ino->i_size = proc_ino->i_size;
+	ino->i_blksize = proc_ino->i_blksize;
+	ino->i_blocks = proc_ino->i_blocks;
+}
+
+static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct dentry *proc_dentry, *new, *parent;
+	struct inode *inode;
+	int err, deleted;
+
+	deleted = file_removed(dentry, NULL);
+	if(deleted < 0)
+		return(ERR_PTR(deleted));
+	else if(deleted)
+		return(ERR_PTR(-ENOENT));
+
+	err = -ENOMEM;
+	parent = HPPFS_I(ino)->proc_dentry;
+	down(&parent->d_inode->i_sem);
+	proc_dentry = d_lookup(parent, &dentry->d_name);
+	if(proc_dentry == NULL){
+		proc_dentry = d_alloc(parent, &dentry->d_name);
+		if(proc_dentry == NULL){
+			up(&parent->d_inode->i_sem);
+			goto out;
+		}
+		new = (*parent->d_inode->i_op->lookup)(parent->d_inode,
+						       proc_dentry, NULL);
+		if(new){
+			dput(proc_dentry);
+			proc_dentry = new;
+		}
+	}
+	up(&parent->d_inode->i_sem);
+
+	if(IS_ERR(proc_dentry))
+		return(proc_dentry);
+
+	inode = iget(ino->i_sb, 0);
+	if(inode == NULL)
+		goto out_dput;
+
+	err = init_inode(inode, proc_dentry);
+	if(err)
+		goto out_put;
+
+	hppfs_read_inode(inode);
+
+	d_add(dentry, inode);
+	dentry->d_op = &hppfs_dentry_ops;
+	return(NULL);
+
+ out_put:
+	iput(inode);
+ out_dput:
+	dput(proc_dentry);
+ out:
+	return(ERR_PTR(err));
+}
+
+static struct inode_operations hppfs_file_iops = {
+};
+
+static ssize_t read_proc(struct file *file, char *buf, ssize_t count,
+			 loff_t *ppos, int is_user)
+{
+	ssize_t (*read)(struct file *, char *, size_t, loff_t *);
+	ssize_t n;
+
+	read = file->f_dentry->d_inode->i_fop->read;
+
+	if(!is_user)
+		set_fs(KERNEL_DS);
+
+	n = (*read)(file, buf, count, &file->f_pos);
+
+	if(!is_user)
+		set_fs(USER_DS);
+
+	if(ppos) *ppos = file->f_pos;
+	return(n);
+}
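+
+/* read_proc() above invokes the underlying procfs read op directly.  When
+ * the destination buffer is a kernel pointer (the filtering path that pumps
+ * /proc contents into a host socket), the address limit is widened with
+ * set_fs(KERNEL_DS) for the duration of the call so that procfs's
+ * copy_to_user() accepts it.
+ */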
+
+static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count)
+{
+	ssize_t n;
+	int cur, err;
+	char *new_buf;
+
+	n = -ENOMEM;
+	new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if(new_buf == NULL){
+		printk("hppfs_read_file : kmalloc failed\n");
+		goto out;
+	}
+	n = 0;
+	while(count > 0){
+		cur = min_t(ssize_t, count, PAGE_SIZE);
+		err = os_read_file(fd, new_buf, cur);
+		if(err < 0){
+			printk("hppfs_read_file : read failed, errno = %d\n",
+			       err);
+			n = err;
+			goto out_free;
+		}
+		else if(err == 0)
+			break;
+
+		if(copy_to_user(buf, new_buf, err)){
+			n = -EFAULT;
+			goto out_free;
+		}
+		n += err;
+		count -= err;
+	}
+ out_free:
+	kfree(new_buf);
+ out:
+	return(n);
+}
+
+static ssize_t hppfs_read(struct file *file, char *buf, size_t count,
+			  loff_t *ppos)
+{
+	struct hppfs_private *hppfs = file->private_data;
+	struct hppfs_data *data;
+	loff_t off;
+	int err;
+
+	if(hppfs->contents != NULL){
+		if(*ppos >= hppfs->len) return(0);
+
+		/* Clamp the read to the cached length, and never copy past
+		 * the end of the block that *ppos lands in.
+		 */
+		if(*ppos + count > hppfs->len)
+			count = hppfs->len - *ppos;
+
+		data = hppfs->contents;
+		off = *ppos;
+		while(off >= sizeof(data->contents)){
+			data = list_entry(data->list.next, struct hppfs_data,
+					  list);
+			off -= sizeof(data->contents);
+		}
+
+		if(off + count > sizeof(data->contents))
+			count = sizeof(data->contents) - off;
+		if(copy_to_user(buf, &data->contents[off], count))
+			return(-EFAULT);
+		*ppos += count;
+	}
+	else if(hppfs->host_fd != -1){
+		err = os_seek_file(hppfs->host_fd, *ppos);
+		if(err){
+			printk("hppfs_read : seek failed, errno = %d\n", err);
+			return(err);
+		}
+		count = hppfs_read_file(hppfs->host_fd, buf, count);
+		if(count > 0)
+			*ppos += count;
+	}
+	else count = read_proc(&hppfs->proc_file, buf, count, ppos, 1);
+
+	return(count);
+}
+
+static ssize_t hppfs_write(struct file *file, const char *buf, size_t len,
+			   loff_t *ppos)
+{
+	struct hppfs_private *data = file->private_data;
+	struct file *proc_file = &data->proc_file;
+	ssize_t (*write)(struct file *, const char *, size_t, loff_t *);
+	int err;
+
+	write = proc_file->f_dentry->d_inode->i_fop->write;
+
+	proc_file->f_pos = file->f_pos;
+	err = (*write)(proc_file, buf, len, &proc_file->f_pos);
+	file->f_pos = proc_file->f_pos;
+
+	return(err);
+}
+
+static int open_host_sock(char *host_file, int *filter_out)
+{
+	char *end;
+	int fd;
+
+	end = &host_file[strlen(host_file)];
+	strcpy(end, "/rw");
+	*filter_out = 1;
+	fd = os_connect_socket(host_file);
+	if(fd > 0)
+		return(fd);
+
+	strcpy(end, "/r");
+	*filter_out = 0;
+	fd = os_connect_socket(host_file);
+	return(fd);
+}
+
+static void free_contents(struct hppfs_data *head)
+{
+	struct hppfs_data *data;
+	struct list_head *ele, *next;
+
+	if(head == NULL) return;
+
+	list_for_each_safe(ele, next, &head->list){
+		data = list_entry(ele, struct hppfs_data, list);
+		kfree(data);
+	}
+	kfree(head);
+}
+
+static struct hppfs_data *hppfs_get_data(int fd, int filter,
+					 struct file *proc_file,
+					 struct file *hppfs_file,
+					 loff_t *size_out)
+{
+	struct hppfs_data *data, *new, *head;
+	int n, err;
+
+	err = -ENOMEM;
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if(data == NULL){
+		printk("hppfs_get_data : head allocation failed\n");
+		goto failed;
+	}
+
+	INIT_LIST_HEAD(&data->list);
+
+	head = data;
+	*size_out = 0;
+
+	if(filter){
+		while((n = read_proc(proc_file, data->contents,
+				     sizeof(data->contents), NULL, 0)) > 0)
+			os_write_file(fd, data->contents, n);
+		err = os_shutdown_socket(fd, 0, 1);
+		if(err){
+			printk("hppfs_get_data : failed to shut down "
+			       "socket\n");
+			goto failed_free;
+		}
+	}
+	while(1){
+		n = os_read_file(fd, data->contents, sizeof(data->contents));
+		if(n < 0){
+			err = n;
+			printk("hppfs_get_data : read failed, errno = %d\n",
+			       err);
+			goto failed_free;
+		}
+		else if(n == 0)
+			break;
+
+		*size_out += n;
+
+		if(n < sizeof(data->contents))
+			break;
+
+		new = kmalloc(sizeof(*data), GFP_KERNEL);
+		if(new == NULL){
+			printk("hppfs_get_data : data allocation failed\n");
+			err = -ENOMEM;
+			goto failed_free;
+		}
+
+		INIT_LIST_HEAD(&new->list);
+		list_add(&new->list, &data->list);
+		data = new;
+	}
+	return(head);
+
+ failed_free:
+	free_contents(head);
+ failed:
+	return(ERR_PTR(err));
+}
+
+static struct hppfs_private *hppfs_data(void)
+{
+	struct hppfs_private *data;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if(data == NULL)
+		return(data);
+
+	*data = ((struct hppfs_private) { .host_fd	= -1,
+					  .len		= -1,
+					  .contents	= NULL } );
+	return(data);
+}
+
+static int file_mode(int fmode)
+{
+	if(fmode == (FMODE_READ | FMODE_WRITE))
+		return(O_RDWR);
+	if(fmode == FMODE_READ)
+		return(O_RDONLY);
+	if(fmode == FMODE_WRITE)
+		return(O_WRONLY);
+	return(0);
+}
+
+static int hppfs_open(struct inode *inode, struct file *file)
+{
+	struct hppfs_private *data;
+	struct dentry *proc_dentry;
+	char *host_file;
+	int err, fd, type, filter;
+
+	err = -ENOMEM;
+	data = hppfs_data();
+	if(data == NULL)
+		goto out;
+
+	host_file = dentry_name(file->f_dentry, strlen("/rw"));
+	if(host_file == NULL)
+		goto out_free2;
+
+	proc_dentry = HPPFS_I(inode)->proc_dentry;
+
+	/* XXX This isn't closed anywhere */
+	err = open_private_file(&data->proc_file, proc_dentry,
+				file_mode(file->f_mode));
+	if(err)
+		goto out_free1;
+
+	type = os_file_type(host_file);
+	if(type == OS_TYPE_FILE){
+		fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
+		if(fd >= 0)
+			data->host_fd = fd;
+		else printk("hppfs_open : failed to open '%s', errno = %d\n",
+			    host_file, -fd);
+
+		data->contents = NULL;
+	}
+	else if(type == OS_TYPE_DIR){
+		fd = open_host_sock(host_file, &filter);
+		if(fd > 0){
+			data->contents = hppfs_get_data(fd, filter,
+							&data->proc_file,
+							file, &data->len);
+			if(!IS_ERR(data->contents))
+				data->host_fd = fd;
+		}
+		else printk("hppfs_open : failed to open a socket in "
+			    "'%s', errno = %d\n", host_file, -fd);
+	}
+	kfree(host_file);
+
+	file->private_data = data;
+	return(0);
+
+ out_free1:
+	kfree(host_file);
+ out_free2:
+	free_contents(data->contents);
+	kfree(data);
+ out:
+	return(err);
+}
+
+static int hppfs_dir_open(struct inode *inode, struct file *file)
+{
+	struct hppfs_private *data;
+	struct dentry *proc_dentry;
+	int err;
+
+	err = -ENOMEM;
+	data = hppfs_data();
+	if(data == NULL)
+		goto out;
+
+	proc_dentry = HPPFS_I(inode)->proc_dentry;
+	err = open_private_file(&data->proc_file, proc_dentry,
+				file_mode(file->f_mode));
+	if(err)
+		goto out_free;
+
+	file->private_data = data;
+	return(0);
+
+ out_free:
+	kfree(data);
+ out:
+	return(err);
+}
+
+static loff_t hppfs_llseek(struct file *file, loff_t off, int where)
+{
+	struct hppfs_private *data = file->private_data;
+	struct file *proc_file = &data->proc_file;
+	loff_t (*llseek)(struct file *, loff_t, int);
+	loff_t ret;
+
+	llseek = proc_file->f_dentry->d_inode->i_fop->llseek;
+	if(llseek != NULL){
+		ret = (*llseek)(proc_file, off, where);
+		if(ret < 0)
+			return(ret);
+	}
+
+	return(default_llseek(file, off, where));
+}
+
+static struct file_operations hppfs_file_fops = {
+	.owner		= NULL,
+	.llseek		= hppfs_llseek,
+	.read		= hppfs_read,
+	.write		= hppfs_write,
+	.open		= hppfs_open,
+};
+
+struct hppfs_dirent {
+	void *vfs_dirent;
+	filldir_t filldir;
+	struct dentry *dentry;
+};
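+
+/* hppfs_filldir() below is interposed on the procfs readdir: any entry
+ * whose "remove" file exists on the host is dropped from the listing
+ * before it reaches the real filldir callback.
+ */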
+
+static int hppfs_filldir(void *d, const char *name, int size,
+			 loff_t offset, ino_t inode, unsigned int type)
+{
+	struct hppfs_dirent *dirent = d;
+
+	if(file_removed(dirent->dentry, name))
+		return(0);
+
+	return((*dirent->filldir)(dirent->vfs_dirent, name, size, offset,
+				  inode, type));
+}
+
+static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
+{
+	struct hppfs_private *data = file->private_data;
+	struct file *proc_file = &data->proc_file;
+	int (*readdir)(struct file *, void *, filldir_t);
+	struct hppfs_dirent dirent = ((struct hppfs_dirent)
+				      { .vfs_dirent	= ent,
+					.filldir	= filldir,
+					.dentry		= file->f_dentry } );
+	int err;
+
+	readdir = proc_file->f_dentry->d_inode->i_fop->readdir;
+
+	proc_file->f_pos = file->f_pos;
+	err = (*readdir)(proc_file, &dirent, hppfs_filldir);
+	file->f_pos = proc_file->f_pos;
+
+	return(err);
+}
+
+static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	return(0);
+}
+
+static struct file_operations hppfs_dir_fops = {
+	.owner		= NULL,
+	.readdir	= hppfs_readdir,
+	.open		= hppfs_dir_open,
+	.fsync		= hppfs_fsync,
+};
+
+static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf)
+{
+	sf->f_blocks = 0;
+	sf->f_bfree = 0;
+	sf->f_bavail = 0;
+	sf->f_files = 0;
+	sf->f_ffree = 0;
+	sf->f_type = HPPFS_SUPER_MAGIC;
+	return(0);
+}
+
+static struct inode *hppfs_alloc_inode(struct super_block *sb)
+{
+	struct hppfs_inode_info *hi;
+
+	hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+	if(hi == NULL)
+		return(NULL);
+
+	*hi = ((struct hppfs_inode_info) { .proc_dentry	= NULL });
+	inode_init_once(&hi->vfs_inode);
+	return(&hi->vfs_inode);
+}
+
+void hppfs_delete_inode(struct inode *ino)
+{
+	clear_inode(ino);
+}
+
+static void hppfs_destroy_inode(struct inode *inode)
+{
+	kfree(HPPFS_I(inode));
+}
+
+static struct super_operations hppfs_sbops = {
+	.alloc_inode	= hppfs_alloc_inode,
+	.destroy_inode	= hppfs_destroy_inode,
+	.read_inode	= hppfs_read_inode,
+	.delete_inode	= hppfs_delete_inode,
+	.statfs		= hppfs_statfs,
+};
+
+static int hppfs_readlink(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct file proc_file;
+	struct dentry *proc_dentry;
+	int (*readlink)(struct dentry *, char *, int);
+	int err, n;
+
+	proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+	err = open_private_file(&proc_file, proc_dentry, O_RDONLY);
+	if(err)
+		return(err);
+
+	readlink = proc_dentry->d_inode->i_op->readlink;
+	n = (*readlink)(proc_dentry, buffer, buflen);
+
+	close_private_file(&proc_file);
+
+	return(n);
+}
+
+static int hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct file proc_file;
+	struct dentry *proc_dentry;
+	int (*follow_link)(struct dentry *, struct nameidata *);
+	int err, n;
+
+	proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
+	err = open_private_file(&proc_file, proc_dentry, O_RDONLY);
+	if(err)
+		return(err);
+
+	follow_link = proc_dentry->d_inode->i_op->follow_link;
+	n = (*follow_link)(proc_dentry, nd);
+
+	close_private_file(&proc_file);
+
+	return(n);
+}
+
+static struct inode_operations hppfs_dir_iops = {
+	.lookup		= hppfs_lookup,
+};
+
+static struct inode_operations hppfs_link_iops = {
+	.readlink	= hppfs_readlink,
+	.follow_link	= hppfs_follow_link,
+};
+
+static int init_inode(struct inode *inode, struct dentry *dentry)
+{
+	if(S_ISDIR(dentry->d_inode->i_mode)){
+		inode->i_op = &hppfs_dir_iops;
+		inode->i_fop = &hppfs_dir_fops;
+	}
+	else if(S_ISLNK(dentry->d_inode->i_mode)){
+		inode->i_op = &hppfs_link_iops;
+		inode->i_fop = &hppfs_file_fops;
+	}
+	else {
+		inode->i_op = &hppfs_file_iops;
+		inode->i_fop = &hppfs_file_fops;
+	}
+
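+	/* Stash the dentry of the procfs object this inode shadows; the
+	 * file, directory, and link operations above all forward through
+	 * this pointer.
+	 */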
+	HPPFS_I(inode)->proc_dentry = dentry;
+
+	return(0);
+}
+
+static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
+{
+	struct inode *root_inode;
+	struct file_system_type *procfs;
+	struct super_block *proc_sb;
+	int err;
+
+	err = -ENOENT;
+	procfs = get_fs_type("proc");
+	if(procfs == NULL)
+		goto out;
+
+	if(list_empty(&procfs->fs_supers))
+		goto out;
+
+	proc_sb = list_entry(procfs->fs_supers.next, struct super_block,
+			     s_instances);
+
+	sb->s_blocksize = 1024;
+	sb->s_blocksize_bits = 10;
+	sb->s_magic = HPPFS_SUPER_MAGIC;
+	sb->s_op = &hppfs_sbops;
+
+	root_inode = iget(sb, 0);
+	if(root_inode == NULL)
+		goto out;
+
+	err = init_inode(root_inode, proc_sb->s_root);
+	if(err)
+		goto out_put;
+
+	err = -ENOMEM;
+	sb->s_root = d_alloc_root(root_inode);
+	if(sb->s_root == NULL)
+		goto out_put;
+
+	hppfs_read_inode(root_inode);
+
+	return(0);
+
+ out_put:
+	iput(root_inode);
+ out:
+	return(err);
+}
+
+static struct super_block *hppfs_read_super(struct file_system_type *type,
+					    int flags, const char *dev_name,
+					    void *data)
+{
+	return(get_sb_nodev(type, flags, data, hppfs_fill_super));
+}
+
+static struct file_system_type hppfs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "hppfs",
+	.get_sb		= hppfs_read_super,
+	.kill_sb	= kill_anon_super,
+	.fs_flags	= 0,
+};
+
+static int __init init_hppfs(void)
+{
+	return(register_filesystem(&hppfs_type));
+}
+
+static void __exit exit_hppfs(void)
+{
+	unregister_filesystem(&hppfs_type);
+}
+
+module_init(init_hppfs)
+module_exit(exit_hppfs)
+MODULE_LICENSE("GPL");
+
+/*
+ * Overrides for Emacs so that we follow Linus's tabbing style.
+ * Emacs will notice this stuff at the end of the file and automatically
+ * adjust the settings for this buffer only.  This must remain at the end
+ * of the file.
+ * ---------------------------------------------------------------------------
+ * Local variables:
+ * c-file-style: "linux"
+ * End:
+ */
diff -Nru a/fs/inode.c b/fs/inode.c
--- a/fs/inode.c	Fri Oct 31 14:10:54 2003
+++ b/fs/inode.c	Fri Oct 31 14:10:54 2003
@@ -1012,7 +1012,7 @@
 
 EXPORT_SYMBOL(generic_delete_inode);
 
-static void generic_forget_inode(struct inode *inode)
+void generic_forget_inode(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 
@@ -1039,6 +1039,7 @@
 	clear_inode(inode);
 	destroy_inode(inode);
 }
+EXPORT_SYMBOL(generic_forget_inode);
 
 /*
  * Normal UNIX filesystem behaviour: delete the
diff -Nru a/fs/jbd/transaction.c b/fs/jbd/transaction.c
--- a/fs/jbd/transaction.c	Fri Oct 31 14:10:53 2003
+++ b/fs/jbd/transaction.c	Fri Oct 31 14:10:53 2003
@@ -108,6 +108,7 @@
 
 	jbd_debug(3, "New handle %p going live.\n", handle);
 
+	handle->h_journal = journal;
 repeat:
 
 	/*
@@ -147,10 +148,13 @@
 	 * lock to be released.
 	 */
 	if (transaction->t_state == T_LOCKED) {
+		DEFINE_WAIT(wait);
+
+		prepare_to_wait(&journal->j_wait_transaction_locked,
+				&wait, TASK_UNINTERRUPTIBLE);
 		spin_unlock(&journal->j_state_lock);
-		jbd_debug(3, "Handle %p stalling...\n", handle);
-		wait_event(journal->j_wait_transaction_locked,
-				transaction->t_state != T_LOCKED);
+		schedule();
+		finish_wait(&journal->j_wait_transaction_locked, &wait);
 		goto repeat;
 	}
 
@@ -245,6 +249,23 @@
 	return handle;
 }
 
+/*
+ * push @handle into ->fs_context stack
+ */
+static void push_handle(handle_t *handle)
+{
+	handle->h_parent = current->fs_context;
+	current->fs_context = (struct fs_activation *) handle;
+}
+
+/*
+ * pop top of ->fs_context stack
+ */
+static void pop_handle(handle_t *handle)
+{
+	current->fs_context = (struct fs_activation *) handle->h_parent;
+}
+
 /**
  * handle_t *journal_start() - Obtain a new handle.
  * @journal: Journal to start transaction on.
@@ -267,7 +288,7 @@
 	if (!journal)
 		return ERR_PTR(-EROFS);
 
-	if (handle) {
+	if (handle && handle->h_journal == journal) {
 		J_ASSERT(handle->h_transaction->t_journal == journal);
 		handle->h_ref++;
 		return handle;
@@ -277,12 +298,12 @@
 	if (!handle)
 		return ERR_PTR(-ENOMEM);
 
-	current->journal_info = handle;
+	push_handle(handle);
 
 	err = start_this_handle(journal, handle);
 	if (err < 0) {
+		pop_handle(handle);
 		jbd_free_handle(handle);
-		current->journal_info = NULL;
 		handle = ERR_PTR(err);
 	}
 	return handle;
@@ -1357,7 +1379,7 @@
 	} while (old_handle_count != transaction->t_handle_count);
 	}
 
-	current->journal_info = NULL;
+	pop_handle(handle);
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&transaction->t_handle_lock);
 	transaction->t_outstanding_credits -= handle->h_buffer_credits;
diff -Nru a/fs/jbd/transaction.c~fs_activation.diff b/fs/jbd/transaction.c~fs_activation.diff
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/fs/jbd/transaction.c~fs_activation.diff	Fri Oct 31 14:10:54 2003
@@ -0,0 +1,2073 @@
+/*
+ * linux/fs/transaction.c
+ *
+ * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+ *
+ * Copyright 1998 Red Hat corp --- All Rights Reserved
+ *
+ * This file is part of the Linux kernel and is made available under
+ * the terms of the GNU General Public License, version 2, or at your
+ * option, any later version, incorporated herein by reference.
+ *
+ * Generic filesystem transaction handling code; part of the ext2fs
+ * journaling system.
+ *
+ * This file manages transactions (compound commits managed by the
+ * journaling code) and handles (individual atomic operations by the
+ * filesystem).
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/smp_lock.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+
+/*
+ * get_transaction: obtain a new transaction_t object.
+ *
+ * Simply allocate and initialise a new transaction.  Create it in
+ * RUNNING state and add it to the current journal (which should not
+ * have an existing running transaction: we only make a new transaction
+ * once we have started to commit the old one).
+ *
+ * Preconditions:
+ * The journal MUST be locked.  We don't perform atomic mallocs on the
+ * new transaction and we can't block without protecting against other
+ * processes trying to touch the journal while it is in transition.
+ *
+ * Called under j_state_lock
+ */
+
+static transaction_t *
+get_transaction(journal_t *journal, transaction_t *transaction)
+{
+	transaction->t_journal = journal;
+	transaction->t_state = T_RUNNING;
+	transaction->t_tid = journal->j_transaction_sequence++;
+	transaction->t_expires = jiffies + journal->j_commit_interval;
+	INIT_LIST_HEAD(&transaction->t_jcb);
+	spin_lock_init(&transaction->t_handle_lock);
+	spin_lock_init(&transaction->t_jcb_lock);
+
+	/* Set up the commit timer for the new transaction. */
+	journal->j_commit_timer->expires = transaction->t_expires;
+	add_timer(journal->j_commit_timer);
+
+	J_ASSERT(journal->j_running_transaction == NULL);
+	journal->j_running_transaction = transaction;
+
+	return transaction;
+}
+
+/*
+ * Handle management.
+ *
+ * A handle_t is an object which represents a single atomic update to a
+ * filesystem, and which tracks all of the modifications which form part
+ * of that one update.
+ */
+
+/*
+ * start_this_handle: Given a handle, deal with any locking or stalling
+ * needed to make sure that there is enough journal space for the handle
+ * to begin.
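+ * It may block at several points: on the journal's transaction barrier,
+ * on a T_LOCKED transaction that is being committed, or in
+ * __log_wait_for_space() waiting for a checkpoint to free some log.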
Attach the handle to a transaction and set up the + * transaction's buffer credits. + */ + +static int start_this_handle(journal_t *journal, handle_t *handle) +{ + transaction_t *transaction; + int needed; + int nblocks = handle->h_buffer_credits; + transaction_t *new_transaction = NULL; + int ret; + + if (nblocks > journal->j_max_transaction_buffers) { + printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", + current->comm, nblocks, + journal->j_max_transaction_buffers); + ret = -ENOSPC; + goto out; + } + +alloc_transaction: + if (!journal->j_running_transaction) { + new_transaction = jbd_kmalloc(sizeof(*new_transaction), + GFP_NOFS); + if (!new_transaction) { + ret = -ENOMEM; + goto out; + } + memset(new_transaction, 0, sizeof(*new_transaction)); + } + + jbd_debug(3, "New handle %p going live.\n", handle); + +repeat: + + /* + * We need to hold j_state_lock until t_updates has been incremented, + * for proper journal barrier handling + */ + spin_lock(&journal->j_state_lock); +repeat_locked: + if (is_journal_aborted(journal) || + (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { + spin_unlock(&journal->j_state_lock); + ret = -EROFS; + goto out; + } + + /* Wait on the journal's transaction barrier if necessary */ + if (journal->j_barrier_count) { + spin_unlock(&journal->j_state_lock); + wait_event(journal->j_wait_transaction_locked, + journal->j_barrier_count == 0); + goto repeat; + } + + if (!journal->j_running_transaction) { + if (!new_transaction) { + spin_unlock(&journal->j_state_lock); + goto alloc_transaction; + } + get_transaction(journal, new_transaction); + new_transaction = NULL; + } + + transaction = journal->j_running_transaction; + + /* + * If the current transaction is locked down for commit, wait for the + * lock to be released. + */ + if (transaction->t_state == T_LOCKED) { + spin_unlock(&journal->j_state_lock); + jbd_debug(3, "Handle %p stalling...\n", handle); + wait_event(journal->j_wait_transaction_locked, + transaction->t_state != T_LOCKED); + goto repeat; + } + + /* + * If there is not enough space left in the log to write all potential + * buffers requested by this operation, we need to stall pending a log + * checkpoint to free some more log space. + */ + spin_lock(&transaction->t_handle_lock); + needed = transaction->t_outstanding_credits + nblocks; + + if (needed > journal->j_max_transaction_buffers) { + /* + * If the current transaction is already too large, then start + * to commit it: we can then go back and attach this handle to + * a new transaction. + */ + DEFINE_WAIT(wait); + + jbd_debug(2, "Handle %p starting new commit...\n", handle); + spin_unlock(&transaction->t_handle_lock); + prepare_to_wait(&journal->j_wait_transaction_locked, &wait, + TASK_UNINTERRUPTIBLE); + __log_start_commit(journal, transaction->t_tid); + spin_unlock(&journal->j_state_lock); + schedule(); + finish_wait(&journal->j_wait_transaction_locked, &wait); + goto repeat; + } + + /* + * The commit code assumes that it can get enough log space + * without forcing a checkpoint. This is *critical* for + * correctness: a checkpoint of a buffer which is also + * associated with a committing transaction creates a deadlock, + * so commit simply cannot force through checkpoints. + * + * We must therefore ensure the necessary space in the journal + * *before* starting to dirty potentially checkpointed buffers + * in the new transaction. + * + * The worst part is, any transaction currently committing can + * reduce the free space arbitrarily. 
Be careful to account for
+	 * those buffers when checkpointing.
+	 */
+
+	/*
+	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
+	 * a _lot_ of headroom: 1/4 of the journal plus the size of
+	 * the committing transaction.  Really, we only need to give it
+	 * committing_transaction->t_outstanding_credits plus "enough" for
+	 * the log control blocks.
+	 * Also, this test is inconsistent with the matching one in
+	 * journal_extend().
+	 */
+	if (__log_space_left(journal) < jbd_space_needed(journal)) {
+		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
+		spin_unlock(&transaction->t_handle_lock);
+		__log_wait_for_space(journal);
+		goto repeat_locked;
+	}
+
+	/* OK, account for the buffers that this operation expects to
+	 * use and add the handle to the running transaction. */
+
+	handle->h_transaction = transaction;
+	transaction->t_outstanding_credits += nblocks;
+	transaction->t_updates++;
+	transaction->t_handle_count++;
+	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
+		  handle, nblocks, transaction->t_outstanding_credits,
+		  __log_space_left(journal));
+	spin_unlock(&transaction->t_handle_lock);
+	spin_unlock(&journal->j_state_lock);
+	ret = 0;
+out:
+	if (new_transaction)
+		kfree(new_transaction);
+	return ret;
+}
+
+/* Allocate a new handle.  This should probably be in a slab... */
+static handle_t *new_handle(int nblocks)
+{
+	handle_t *handle = jbd_alloc_handle(GFP_NOFS);
+	if (!handle)
+		return NULL;
+	memset(handle, 0, sizeof(*handle));
+	handle->h_buffer_credits = nblocks;
+	handle->h_ref = 1;
+	INIT_LIST_HEAD(&handle->h_jcb);
+
+	return handle;
+}
+
+/**
+ * handle_t *journal_start() - Obtain a new handle.
+ * @journal: Journal to start transaction on.
+ * @nblocks: number of block buffers we might modify
+ *
+ * We make sure that the transaction can guarantee at least nblocks of
+ * modified buffers in the log.  We block until the log can guarantee
+ * that much space.
+ *
+ * This function is visible to journal users (like ext3fs), so is not
+ * called with the journal already locked.
+ *
+ * Return a pointer to a newly allocated handle, or NULL on failure
+ */
+handle_t *journal_start(journal_t *journal, int nblocks)
+{
+	handle_t *handle = journal_current_handle();
+	int err;
+
+	if (!journal)
+		return ERR_PTR(-EROFS);
+
+	if (handle) {
+		J_ASSERT(handle->h_transaction->t_journal == journal);
+		handle->h_ref++;
+		return handle;
+	}
+
+	handle = new_handle(nblocks);
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+
+	current->journal_info = handle;
+
+	err = start_this_handle(journal, handle);
+	if (err < 0) {
+		jbd_free_handle(handle);
+		current->journal_info = NULL;
+		handle = ERR_PTR(err);
+	}
+	return handle;
+}
+
+/**
+ * int journal_extend() - extend buffer credits.
+ * @handle: handle to 'extend'
+ * @nblocks: nr blocks to try to extend by.
+ *
+ * Some transactions, such as large extends and truncates, can be done
+ * atomically all at once or in several stages.  The operation requests
+ * a credit for a number of buffer modifications in advance, but can
+ * extend its credit if it needs more.
+ *
+ * journal_extend tries to give the running handle more buffer credits.
+ * It does not guarantee the allocation - this is a best-effort only.
+ * The calling process MUST be able to deal cleanly with a failure to
+ * extend here.
+ *
+ * Return 0 on success, non-zero on failure.
+ *
+ * return code < 0 implies an error
+ * return code > 0 implies normal transaction-full status.
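+ *
+ * A typical caller treats both failure modes the same way and falls back
+ * to journal_restart(), along these (illustrative) lines:
+ *
+ *	if (journal_extend(handle, nblocks) != 0)
+ *		err = journal_restart(handle, nblocks);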
+ */
+int journal_extend(handle_t *handle, int nblocks)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	int result;
+	int wanted;
+
+	result = -EIO;
+	if (is_handle_aborted(handle))
+		goto error_out;
+
+	result = 1;
+
+	spin_lock(&journal->j_state_lock);
+
+	/* Don't extend a locked-down transaction! */
+	if (handle->h_transaction->t_state != T_RUNNING) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "transaction not running\n", handle, nblocks);
+		goto error_out;
+	}
+
+	spin_lock(&transaction->t_handle_lock);
+	wanted = transaction->t_outstanding_credits + nblocks;
+
+	if (wanted > journal->j_max_transaction_buffers) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "transaction too large\n", handle, nblocks);
+		goto unlock;
+	}
+
+	if (wanted > __log_space_left(journal)) {
+		jbd_debug(3, "denied handle %p %d blocks: "
+			  "insufficient log space\n", handle, nblocks);
+		goto unlock;
+	}
+
+	handle->h_buffer_credits += nblocks;
+	transaction->t_outstanding_credits += nblocks;
+	result = 0;
+
+	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+unlock:
+	spin_unlock(&transaction->t_handle_lock);
+error_out:
+	spin_unlock(&journal->j_state_lock);
+	return result;
+}
+
+
+/**
+ * int journal_restart() - restart a handle.
+ * @handle: handle to restart
+ * @nblocks: nr credits requested
+ *
+ * Restart a handle for a multi-transaction filesystem
+ * operation.
+ *
+ * If the journal_extend() call above fails to grant new buffer credits
+ * to a running handle, a call to journal_restart will commit the
+ * handle's transaction so far and reattach the handle to a new
+ * transaction capable of guaranteeing the requested number of
+ * credits.
+ */
+
+int journal_restart(handle_t *handle, int nblocks)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	int ret;
+
+	/* If we've had an abort of any type, don't even think about
+	 * actually doing the restart! */
+	if (is_handle_aborted(handle))
+		return 0;
+
+	/*
+	 * First unlink the handle from its current transaction, and start the
+	 * commit on that.
+	 */
+	J_ASSERT(transaction->t_updates > 0);
+	J_ASSERT(journal_current_handle() == handle);
+
+	spin_lock(&journal->j_state_lock);
+	spin_lock(&transaction->t_handle_lock);
+	transaction->t_outstanding_credits -= handle->h_buffer_credits;
+	transaction->t_updates--;
+
+	if (!transaction->t_updates)
+		wake_up(&journal->j_wait_updates);
+	spin_unlock(&transaction->t_handle_lock);
+
+	jbd_debug(2, "restarting handle %p\n", handle);
+	__log_start_commit(journal, transaction->t_tid);
+	spin_unlock(&journal->j_state_lock);
+
+	handle->h_buffer_credits = nblocks;
+	ret = start_this_handle(journal, handle);
+	return ret;
+}
+
+
+/**
+ * void journal_lock_updates () - establish a transaction barrier.
+ * @journal: Journal to establish a barrier on.
+ *
+ * This locks out any further updates from being started, and blocks
+ * until all existing updates have completed, returning only once the
+ * journal is in a quiescent state with no updates running.
+ *
+ * The journal lock should not be held on entry.
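+ *
+ * Pairs with journal_unlock_updates().  Concurrent barrier holders are
+ * serialised by the j_barrier semaphore, so only one special
+ * journal-locked operation runs at a time.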
+ */
+void journal_lock_updates(journal_t *journal)
+{
+	DEFINE_WAIT(wait);
+
+	spin_lock(&journal->j_state_lock);
+	++journal->j_barrier_count;
+
+	/* Wait until there are no running updates */
+	while (1) {
+		transaction_t *transaction = journal->j_running_transaction;
+
+		if (!transaction)
+			break;
+
+		spin_lock(&transaction->t_handle_lock);
+		if (!transaction->t_updates) {
+			spin_unlock(&transaction->t_handle_lock);
+			break;
+		}
+		prepare_to_wait(&journal->j_wait_updates, &wait,
+				TASK_UNINTERRUPTIBLE);
+		spin_unlock(&transaction->t_handle_lock);
+		spin_unlock(&journal->j_state_lock);
+		schedule();
+		finish_wait(&journal->j_wait_updates, &wait);
+		spin_lock(&journal->j_state_lock);
+	}
+	spin_unlock(&journal->j_state_lock);
+
+	/*
+	 * We have now established a barrier against other normal updates, but
+	 * we also need to barrier against other journal_lock_updates() calls
+	 * to make sure that we serialise special journal-locked operations
+	 * too.
+	 */
+	down(&journal->j_barrier);
+}
+
+/**
+ * void journal_unlock_updates (journal_t* journal) - release barrier
+ * @journal: Journal to release the barrier on.
+ *
+ * Release a transaction barrier obtained with journal_lock_updates().
+ *
+ * Should be called without the journal lock held.
+ */
+void journal_unlock_updates (journal_t *journal)
+{
+	J_ASSERT(journal->j_barrier_count != 0);
+
+	up(&journal->j_barrier);
+	spin_lock(&journal->j_state_lock);
+	--journal->j_barrier_count;
+	spin_unlock(&journal->j_state_lock);
+	wake_up(&journal->j_wait_transaction_locked);
+}
+
+/*
+ * Report any unexpected dirty buffers which turn up.  Normally those
+ * indicate an error, but they can occur if the user is running (say)
+ * tune2fs to modify the live filesystem, so we need the option of
+ * continuing as gracefully as possible.
+ *
+ * The caller should already hold the journal lock and
+ * j_list_lock spinlock: most callers will need those anyway
+ * in order to probe the buffer's journaling state safely.
+ */
+static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
+{
+	struct buffer_head *bh = jh2bh(jh);
+	int jlist;
+
+	if (buffer_dirty(bh)) {
+		/* If this buffer is one which might reasonably be dirty
+		 * --- ie. data, or not part of this journal --- then
+		 * we're OK to leave it alone, but otherwise we need to
+		 * move the dirty bit to the journal's own internal
+		 * JBDDirty bit. */
+		jlist = jh->b_jlist;
+
+		if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
+		    jlist == BJ_Shadow || jlist == BJ_Forget) {
+			if (test_clear_buffer_dirty(jh2bh(jh))) {
+				set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
+			}
+		}
+	}
+}
+
+/*
+ * If the buffer is already part of the current transaction, then there
+ * is nothing we need to do.  If it is already part of a prior
+ * transaction which we are still committing to disk, then we need to
+ * make sure that we do not overwrite the old copy: we do copy-out to
+ * preserve the copy going to disk.  We also account the buffer against
+ * the handle's metadata buffer credits (unless the buffer is already
+ * part of the transaction, that is).
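+ *
+ * If @credits is non-NULL it is incremented for each buffer credit the
+ * handle gives up here, so that an unused reservation can later be
+ * handed back through journal_release_buffer().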
+ * + */ + +static int +do_get_write_access(handle_t *handle, struct journal_head *jh, + int force_copy, int *credits) +{ + struct buffer_head *bh; + transaction_t *transaction; + journal_t *journal; + int error; + char *frozen_buffer = NULL; + int need_copy = 0; + + if (is_handle_aborted(handle)) + return -EROFS; + + transaction = handle->h_transaction; + journal = transaction->t_journal; + + jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); + + JBUFFER_TRACE(jh, "entry"); +repeat: + bh = jh2bh(jh); + + /* @@@ Need to check for errors here at some point. */ + + lock_buffer(bh); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + /* We now hold the buffer lock so it is safe to query the buffer + * state. Is the buffer dirty? + * + * If so, there are two possibilities. The buffer may be + * non-journaled, and undergoing a quite legitimate writeback. + * Otherwise, it is journaled, and we don't expect dirty buffers + * in that state (the buffers should be marked JBD_Dirty + * instead.) So either the IO is being done under our own + * control and this is a bug, or it's a third party IO such as + * dump(8) (which may leave the buffer scheduled for read --- + * ie. locked but not dirty) or tune2fs (which may actually have + * the buffer dirtied, ugh.) */ + + if (buffer_dirty(bh)) { + /* First question: is this buffer already part of the + * current transaction or the existing committing + * transaction? */ + if (jh->b_transaction) { + J_ASSERT_JH(jh, + jh->b_transaction == transaction || + jh->b_transaction == + journal->j_committing_transaction); + if (jh->b_next_transaction) + J_ASSERT_JH(jh, jh->b_next_transaction == + transaction); + JBUFFER_TRACE(jh, "Unexpected dirty buffer"); + jbd_unexpected_dirty_buffer(jh); + } + } + + unlock_buffer(bh); + + error = -EROFS; + if (is_handle_aborted(handle)) { + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + goto out_unlocked; + } + error = 0; + + /* The buffer is already part of this transaction if + * b_transaction or b_next_transaction points to it. */ + + if (jh->b_transaction == transaction || + jh->b_next_transaction == transaction) + goto done_locked; + + /* If there is already a copy-out version of this buffer, then + * we don't need to make another one. */ + + if (jh->b_frozen_data) { + JBUFFER_TRACE(jh, "has frozen data"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + jh->b_next_transaction = transaction; + + J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + if (credits) + (*credits)++; + goto done_locked; + } + + /* Is there data here we need to preserve? */ + + if (jh->b_transaction && jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* There is one case we have to be very careful about. + * If the committing transaction is currently writing + * this buffer out to disk and has NOT made a copy-out, + * then we cannot modify the buffer contents at all + * right now. The essence of copy-out is that it is the + * extra copy, not the primary copy, which gets + * journaled. If the primary copy is already going to + * disk then we cannot do copy-out here. 
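+		 * Instead we wait below for commit to finish writing the
+		 * shadow copy (it wakes the BJ_Shadow waiters once the IO
+		 * completes) and then retry from the top.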
*/ + + if (jh->b_jlist == BJ_Shadow) { + wait_queue_head_t *wqh; + + JBUFFER_TRACE(jh, "on shadow: sleep"); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + /* commit wakes up all shadow buffers after IO */ + wqh = bh_waitq_head(jh2bh(jh)); + wait_event(*wqh, (jh->b_jlist != BJ_Shadow)); + goto repeat; + } + + /* Only do the copy if the currently-owning transaction + * still needs it. If it is on the Forget list, the + * committing transaction is past that stage. The + * buffer had better remain locked during the kmalloc, + * but that should be true --- we hold the journal lock + * still and the buffer is already on the BUF_JOURNAL + * list so won't be flushed. + * + * Subtle point, though: if this is a get_undo_access, + * then we will be relying on the frozen_data to contain + * the new value of the committed_data record after the + * transaction, so we HAVE to force the frozen_data copy + * in that case. */ + + if (jh->b_jlist != BJ_Forget || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, + GFP_NOFS); + if (!frozen_buffer) { + printk(KERN_EMERG + "%s: OOM for frozen_buffer\n", + __FUNCTION__); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + goto done_locked; + } + goto repeat; + } + + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + need_copy = 1; + } + jh->b_next_transaction = transaction; + } + + J_ASSERT(handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + if (credits) + (*credits)++; + + /* Finally, if the buffer is not journaled right now, we need to + * make sure it doesn't get written to disk before the caller + * actually commits the new data. */ + + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + jh->b_transaction = transaction; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + __journal_file_buffer(jh, transaction, BJ_Reserved); + } + +done_locked: + spin_unlock(&journal->j_list_lock); + if (need_copy) { + struct page *page; + int offset; + char *source; + + J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), + "Possible IO failure.\n"); + page = jh2bh(jh)->b_page; + offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; + source = kmap_atomic(page, KM_USER0); + memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); + kunmap_atomic(source, KM_USER0); + } + jbd_unlock_bh_state(bh); + + /* If we are about to journal a buffer, then any revoke pending + on it is no longer valid. */ + journal_cancel_revoke(handle, jh); + +out_unlocked: + if (frozen_buffer) + kfree(frozen_buffer); + + JBUFFER_TRACE(jh, "exit"); + return error; +} + +/** + * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. + * @handle: transaction to add buffer modifications to + * @bh: bh to be used for metadata writes + * + * Returns an error code or 0 on success. + * + * In full data journalling mode the buffer may be of type BJ_AsyncData, + * because we're write()ing a buffer which is also part of a shared mapping. + */ + +int journal_get_write_access(handle_t *handle, + struct buffer_head *bh, int *credits) +{ + struct journal_head *jh = journal_add_journal_head(bh); + int rc; + + /* We do not want to get caught playing with fields which the + * log thread also manipulates. 
Make sure that the buffer
+	 * completes any outstanding IO before proceeding. */
+	rc = do_get_write_access(handle, jh, 0, credits);
+	journal_put_journal_head(jh);
+	return rc;
+}
+
+
+/*
+ * When the user wants to journal a newly created buffer_head
+ * (ie. getblk() returned a new buffer and we are going to populate it
+ * manually rather than reading off disk), then we need to keep the
+ * buffer_head locked until it has been completely filled with new
+ * data.  In this case, we should be able to make the assertion that
+ * the bh is not already part of an existing transaction.
+ *
+ * The buffer should already be locked by the caller by this point.
+ * There is no lock ranking violation: it was a newly created,
+ * unlocked buffer beforehand. */
+
+/**
+ * int journal_get_create_access () - notify intent to use newly created bh
+ * @handle: transaction to add the new buffer to
+ * @bh: new buffer.
+ *
+ * Call this if you create a new bh.
+ */
+int journal_get_create_access(handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	struct journal_head *jh = journal_add_journal_head(bh);
+	int err;
+
+	jbd_debug(5, "journal_head %p\n", jh);
+	err = -EROFS;
+	if (is_handle_aborted(handle))
+		goto out;
+	err = 0;
+
+	JBUFFER_TRACE(jh, "entry");
+	/*
+	 * The buffer may already belong to this transaction due to pre-zeroing
+	 * in the filesystem's new_block code.  It may also be on the previous,
+	 * committing transaction's lists, but it HAS to be in Forget state in
+	 * that case: the transaction must have deleted the buffer for it to be
+	 * reused here.
+	 */
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
+		jh->b_transaction == NULL ||
+		(jh->b_transaction == journal->j_committing_transaction &&
+			  jh->b_jlist == BJ_Forget)));
+
+	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
+
+	J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+	handle->h_buffer_credits--;
+
+	if (jh->b_transaction == NULL) {
+		jh->b_transaction = transaction;
+		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+		__journal_file_buffer(jh, transaction, BJ_Reserved);
+	} else if (jh->b_transaction == journal->j_committing_transaction) {
+		JBUFFER_TRACE(jh, "set next transaction");
+		jh->b_next_transaction = transaction;
+	}
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+
+	/*
+	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
+	 * blocks which contain freed but then revoked metadata.  We need
+	 * to cancel the revoke in case we end up freeing it yet again
+	 * and then reallocating it as data - this would cause a second
+	 * revoke, which hits an assertion error.
+	 */
+	JBUFFER_TRACE(jh, "cancelling revoke");
+	journal_cancel_revoke(handle, jh);
+	journal_put_journal_head(jh);
+out:
+	return err;
+}
+
+/**
+ * int journal_get_undo_access() - Notify intent to modify metadata with
+ * non-rewindable consequences
+ * @handle: transaction
+ * @bh: buffer to undo
+ * @credits: store the number of taken credits here (if not NULL)
+ *
+ * Sometimes there is a need to distinguish between metadata which has
+ * been committed to disk and that which has not.  The ext3fs code uses
+ * this for freeing and allocating space; we have to make sure that we
+ * do not reuse freed space until the deallocation has been committed,
+ * since if we overwrote that space we would make the delete
+ * un-rewindable in case of a crash.
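+ * (The classic case is a block bitmap: a bit freed in this transaction
+ * must not be reallocated until the free has committed, or a crash in
+ * between could re-expose the old block contents after recovery.)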
+ * + * To deal with that, journal_get_undo_access requests write access to a + * buffer for parts of non-rewindable operations such as delete + * operations on the bitmaps. The journaling code must keep a copy of + * the buffer's contents prior to the undo_access call until such time + * as we know that the buffer has definitely been committed to disk. + * + * We never need to know which transaction the committed data is part + * of, buffers touched here are guaranteed to be dirtied later and so + * will be committed to a new transaction in due course, at which point + * we can discard the old committed data pointer. + * + * Returns error number or 0 on success. + */ +int journal_get_undo_access(handle_t *handle, struct buffer_head *bh, + int *credits) +{ + int err; + struct journal_head *jh = journal_add_journal_head(bh); + char *committed_data = NULL; + + JBUFFER_TRACE(jh, "entry"); + + /* + * Do this first --- it can drop the journal lock, so we want to + * make sure that obtaining the committed_data is done + * atomically wrt. completion of any outstanding commits. + */ + err = do_get_write_access(handle, jh, 1, credits); + if (err) + goto out; + +repeat: + if (!jh->b_committed_data) { + committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS); + if (!committed_data) { + printk(KERN_EMERG "%s: No memory for committed data\n", + __FUNCTION__); + err = -ENOMEM; + goto out; + } + } + + jbd_lock_bh_state(bh); + if (!jh->b_committed_data) { + /* Copy out the current buffer contents into the + * preserved, committed copy. */ + JBUFFER_TRACE(jh, "generate b_committed data"); + if (!committed_data) { + jbd_unlock_bh_state(bh); + goto repeat; + } + + jh->b_committed_data = committed_data; + committed_data = NULL; + memcpy(jh->b_committed_data, bh->b_data, bh->b_size); + } + jbd_unlock_bh_state(bh); +out: + journal_put_journal_head(jh); + if (committed_data) + kfree(committed_data); + return err; +} + +/** + * int journal_dirty_data() - mark a buffer as containing dirty data which + * needs to be flushed before we can commit the + * current transaction. + * @handle: transaction + * @bh: bufferhead to mark + * + * The buffer is placed on the transaction's data list and is marked as + * belonging to the transaction. + * + * Returns error number or 0 on success. + * + * journal_dirty_data() can be called via page_launder->ext3_writepage + * by kswapd. + */ +int journal_dirty_data(handle_t *handle, struct buffer_head *bh) +{ + journal_t *journal = handle->h_transaction->t_journal; + int need_brelse = 0; + struct journal_head *jh; + + if (is_handle_aborted(handle)) + return 0; + + jh = journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + + /* + * The buffer could *already* be dirty. Writeout can start + * at any time. + */ + jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); + + /* + * What if the buffer is already part of a running transaction? + * + * There are two cases: + * 1) It is part of the current running transaction. Refile it, + * just in case we have allocated it as metadata, deallocated + * it, then reallocated it as data. + * 2) It is part of the previous, still-committing transaction. + * If all we want to do is to guarantee that the buffer will be + * written to disk before this new transaction commits, then + * being sure that the *previous* transaction has this same + * property is sufficient for us! Just leave it on its old + * transaction. 
+	 *
+	 * In case (2), the buffer must not already exist as metadata
+	 * --- that would violate write ordering (a transaction is free
+	 * to write its data at any point, even before the previous
+	 * committing transaction has committed).  The caller must
+	 * never, ever allow this to happen: there's nothing we can do
+	 * about it in this layer.
+	 */
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+	if (jh->b_transaction) {
+		JBUFFER_TRACE(jh, "has transaction");
+		if (jh->b_transaction != handle->h_transaction) {
+			JBUFFER_TRACE(jh, "belongs to older transaction");
+			J_ASSERT_JH(jh, jh->b_transaction ==
+					journal->j_committing_transaction);
+
+			/* @@@ IS THIS TRUE ? */
+			/*
+			 * Not any more.  Scenario: someone does a write()
+			 * in data=journal mode.  The buffer's transaction has
+			 * moved into commit.  Then someone does another
+			 * write() to the file.  We do the frozen data copyout
+			 * and set b_next_transaction to point to j_running_t.
+			 * And while we're in that state, someone does a
+			 * writepage() in an attempt to pageout the same area
+			 * of the file via a shared mapping.  At present that
+			 * calls journal_dirty_data(), and we get right here.
+			 * It may be too late to journal the data.  Simply
+			 * falling through to the next test will suffice: the
+			 * data will be dirty and will be checkpointed.  The
+			 * ordering comments in the next comment block still
+			 * apply.
+			 */
+			//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+
+			/*
+			 * If we're journalling data, and this buffer was
+			 * subject to a write(), it could be metadata, forget
+			 * or shadow against the committing transaction.  Now,
+			 * someone has dirtied the same darn page via a mapping
+			 * and it is being writepage()'d.
+			 * We *could* just steal the page from commit, with some
+			 * fancy locking there.  Instead, we just skip it -
+			 * don't tie the page's buffers to the new transaction
+			 * at all.
+			 * Implication: if we crash before the writepage() data
+			 * is written into the filesystem, recovery will replay
+			 * the write() data.
+			 */
+			if (jh->b_jlist != BJ_None &&
+					jh->b_jlist != BJ_SyncData) {
+				JBUFFER_TRACE(jh, "Not stealing");
+				goto no_journal;
+			}
+
+			/*
+			 * This buffer may be undergoing writeout in commit.  We
+			 * can't return from here and let the caller dirty it
+			 * again because that can cause the write-out loop in
+			 * commit to never terminate.
+			 */
+			if (buffer_dirty(bh)) {
+				get_bh(bh);
+				spin_unlock(&journal->j_list_lock);
+				jbd_unlock_bh_state(bh);
+				need_brelse = 1;
+				sync_dirty_buffer(bh);
+				jbd_lock_bh_state(bh);
+				spin_lock(&journal->j_list_lock);
+				/* The buffer may become locked again at any
+				   time if it is redirtied */
+			}
+
+			/* journal_clean_data_list() may have got there first */
+			if (jh->b_transaction != NULL) {
+				JBUFFER_TRACE(jh, "unfile from commit");
+				__journal_unfile_buffer(jh);
+				jh->b_transaction = NULL;
+			}
+			/* The buffer will be refiled below */
+
+		}
+		/*
+		 * Special case --- the buffer might actually have been
+		 * allocated and then immediately deallocated in the previous,
+		 * committing transaction, so might still be left on that
+		 * transaction's metadata lists.
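+		 * If so, it is unfiled from there and refiled as
+		 * BJ_SyncData on the running transaction just below.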
+ */ + if (jh->b_jlist != BJ_SyncData) { + JBUFFER_TRACE(jh, "not on correct data list: unfile"); + J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + JBUFFER_TRACE(jh, "file as data"); + __journal_file_buffer(jh, handle->h_transaction, + BJ_SyncData); + } + } else { + JBUFFER_TRACE(jh, "not on a transaction"); + __journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); + } +no_journal: + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + if (need_brelse) { + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + JBUFFER_TRACE(jh, "exit"); + journal_put_journal_head(jh); + return 0; +} + +/** + * int journal_dirty_metadata() - mark a buffer as containing dirty metadata + * @handle: transaction to add buffer to. + * @bh: buffer to mark + * + * mark dirty metadata which needs to be journaled as part of the current + * transaction. + * + * The buffer is placed on the transaction's metadata list and is marked + * as belonging to the transaction. + * + * Returns error number or 0 on success. + * + * Special care needs to be taken if the buffer already belongs to the + * current committing transaction (in which case we should have frozen + * data present for that commit). In that case, we don't relink the + * buffer: that only gets done when the old transaction finally + * completes its commit. + */ +int journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) +{ + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = bh2jh(bh); + + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + if (is_handle_aborted(handle)) + goto out; + + jbd_lock_bh_state(bh); + + /* + * fastpath, to avoid expensive locking. If this buffer is already + * on the running transaction's metadata list there is nothing to do. + * Nobody can take it off again because there is a handle open. + * I _think_ we're OK here with SMP barriers - a mistaken decision will + * result in this test being false, so we go in and take the locks. + */ + if (jh->b_transaction == handle->h_transaction && + jh->b_jlist == BJ_Metadata) { + JBUFFER_TRACE(jh, "fastpath"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_running_transaction); + goto out_unlock_bh; + } + + spin_lock(&journal->j_list_lock); + set_buffer_jbddirty(bh); + + J_ASSERT_JH(jh, jh->b_transaction != NULL); + + /* + * Metadata already on the current transaction list doesn't + * need to be filed. Metadata on another transaction's list must + * be committing, and will be refiled once the commit completes: + * leave it alone for now. + */ + + if (jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "already on other transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + /* And this case is illegal: we can't reuse another + * transaction's data buffer, ever. 
*/
+		/* FIXME: writepage() should be journalled */
+		goto out_unlock_list;
+	}
+
+	/* That test should have eliminated the following case: */
+	J_ASSERT_JH(jh, jh->b_frozen_data == 0);
+
+	JBUFFER_TRACE(jh, "file as BJ_Metadata");
+	__journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+
+out_unlock_list:
+	spin_unlock(&journal->j_list_lock);
+out_unlock_bh:
+	jbd_unlock_bh_state(bh);
+out:
+	JBUFFER_TRACE(jh, "exit");
+	return 0;
+}
+
+/*
+ * journal_release_buffer: undo a get_write_access without any buffer
+ * updates, if the update decided in the end that it didn't need access.
+ *
+ * The caller passes in the number of credits which should be put back for
+ * this buffer (zero or one).
+ *
+ * We leave the buffer attached to t_reserved_list because even though this
+ * handle doesn't want it, some other concurrent handle may want to journal
+ * this buffer.  If that handle is currently in between get_write_access()
+ * and journal_dirty_metadata() then it expects the buffer to be reserved.
+ * If we were to rip it off t_reserved_list here, the other handle will
+ * explode when journal_dirty_metadata is presented with a non-reserved
+ * buffer.
+ *
+ * If nobody really wants to journal this buffer then it will be thrown
+ * away at the start of commit.
+ */
+void
+journal_release_buffer(handle_t *handle, struct buffer_head *bh, int credits)
+{
+	BUFFER_TRACE(bh, "entry");
+	handle->h_buffer_credits += credits;
+}
+
+/**
+ * void journal_forget() - bforget() for potentially-journaled buffers.
+ * @handle: transaction handle
+ * @bh:     bh to 'forget'
+ *
+ * We can only do the bforget if there are no commits pending against the
+ * buffer.  If the buffer is dirty in the current running transaction we
+ * can safely unlink it.
+ *
+ * bh may not be a journalled buffer at all - it may be a non-JBD
+ * buffer which came off the hashtable.  Check for this.
+ *
+ * Decrements bh->b_count by one.
+ *
+ * Allow this call even if the handle has aborted --- it may be part of
+ * the caller's cleanup after an abort.
+ */
+void journal_forget(handle_t *handle, struct buffer_head *bh)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	struct journal_head *jh;
+
+	BUFFER_TRACE(bh, "entry");
+
+	jbd_lock_bh_state(bh);
+	spin_lock(&journal->j_list_lock);
+
+	if (!buffer_jbd(bh))
+		goto not_jbd;
+	jh = bh2jh(bh);
+
+	if (jh->b_transaction == handle->h_transaction) {
+		J_ASSERT_JH(jh, !jh->b_frozen_data);
+
+		/* If we are forgetting a buffer which is already part
+		 * of this transaction, then we can just drop it from
+		 * the transaction immediately. */
+		clear_buffer_dirty(bh);
+		clear_buffer_jbddirty(bh);
+
+		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
+		J_ASSERT_JH(jh, !jh->b_committed_data);
+
+		__journal_unfile_buffer(jh);
+		jh->b_transaction = 0;
+
+		/*
+		 * We are no longer going to journal this buffer.
+		 * However, the commit of this transaction is still
+		 * important to the buffer: the delete that we are now
+		 * processing might obsolete an old log entry, so by
+		 * committing, we can satisfy the buffer's checkpoint.
+		 *
+		 * So, if we have a checkpoint on the buffer, we should
+		 * now refile the buffer on our BJ_Forget list so that
+		 * we know to remove the checkpoint after we commit.
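+		 * If there is no checkpoint, the journal has no further
+		 * interest in the buffer and it is released immediately
+		 * in the else branch below.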
+		 */
+
+		if (jh->b_cp_transaction) {
+			__journal_file_buffer(jh, transaction, BJ_Forget);
+		} else {
+			journal_remove_journal_head(bh);
+			__brelse(bh);
+			if (!buffer_jbd(bh)) {
+				spin_unlock(&journal->j_list_lock);
+				jbd_unlock_bh_state(bh);
+				__bforget(bh);
+				return;
+			}
+		}
+	} else if (jh->b_transaction) {
+		J_ASSERT_JH(jh, (jh->b_transaction ==
+				 journal->j_committing_transaction));
+		/* However, if the buffer is still owned by a prior
+		 * (committing) transaction, we can't drop it yet... */
+		JBUFFER_TRACE(jh, "belongs to older transaction");
+		/* ... but we CAN drop it from the new transaction if we
+		 * have also modified it since the original commit. */
+
+		if (jh->b_next_transaction) {
+			J_ASSERT(jh->b_next_transaction == transaction);
+			jh->b_next_transaction = NULL;
+		}
+	}
+
+not_jbd:
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(bh);
+	__brelse(bh);
+	return;
+}
+
+/**
+ * void journal_callback_set() - Register a callback function for this handle.
+ * @handle: handle to attach the callback to.
+ * @func: function to callback.
+ * @jcb: structure with additional information required by func(), and
+ * some space for jbd internal information.
+ *
+ * The function will be
+ * called when the transaction that this handle is part of has been
+ * committed to disk with the original callback data struct and the
+ * error status of the journal as parameters.  There is no guarantee of
+ * ordering between handles within a single transaction, nor between
+ * callbacks registered on the same handle.
+ *
+ * The caller is responsible for allocating the journal_callback struct.
+ * This is to allow the caller to add as much extra data to the callback
+ * as needed, but reduce the overhead of multiple allocations.  The caller
+ * allocated struct must start with a struct journal_callback at offset 0,
+ * and has the caller-specific data afterwards.
+ */
+void journal_callback_set(handle_t *handle,
+			  void (*func)(struct journal_callback *jcb, int error),
+			  struct journal_callback *jcb)
+{
+	spin_lock(&handle->h_transaction->t_jcb_lock);
+	list_add_tail(&jcb->jcb_list, &handle->h_jcb);
+	spin_unlock(&handle->h_transaction->t_jcb_lock);
+	jcb->jcb_func = func;
+}
+
+/**
+ * int journal_stop() - complete a transaction
+ * @handle: transaction to complete.
+ *
+ * All done for a particular handle.
+ *
+ * There is not much action needed here.  We just return any remaining
+ * buffer credits to the transaction and remove the handle.  The only
+ * complication is that we need to start a commit operation if the
+ * filesystem is marked for synchronous update.
+ *
+ * journal_stop itself will not usually return an error, but it may
+ * do so in unusual circumstances.  In particular, expect it to
+ * return -EIO if a journal_abort has been executed since the
+ * transaction began.
+ */
+int journal_stop(handle_t *handle)
+{
+	transaction_t *transaction = handle->h_transaction;
+	journal_t *journal = transaction->t_journal;
+	int old_handle_count, err;
+
+	J_ASSERT(transaction->t_updates > 0);
+	J_ASSERT(journal_current_handle() == handle);
+
+	if (is_handle_aborted(handle))
+		err = -EIO;
+	else
+		err = 0;
+
+	if (--handle->h_ref > 0) {
+		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+			  handle->h_ref);
+		return err;
+	}
+
+	jbd_debug(4, "Handle %p going down\n", handle);
+
+	/*
+	 * Implement synchronous transaction batching.  If the handle
+	 * was synchronous, don't force a commit immediately.  Let's
+	 * yield and let another thread piggyback onto this transaction.
+	 * Keep doing that while new threads continue to arrive.
+	 * It doesn't cost much - we're about to run a commit and sleep
+	 * on IO anyway.  Speeds up many-threaded, many-dir operations
+	 * by 30x or more...
+	 */
+	if (handle->h_sync) {
+		set_current_state(TASK_RUNNING);
+		do {
+			old_handle_count = transaction->t_handle_count;
+			schedule();
+		} while (old_handle_count != transaction->t_handle_count);
+	}
+
+	current->journal_info = NULL;
+	spin_lock(&journal->j_state_lock);
+	spin_lock(&transaction->t_handle_lock);
+	transaction->t_outstanding_credits -= handle->h_buffer_credits;
+	transaction->t_updates--;
+	if (!transaction->t_updates) {
+		wake_up(&journal->j_wait_updates);
+		if (journal->j_barrier_count)
+			wake_up(&journal->j_wait_transaction_locked);
+	}
+
+	/* Move callbacks from the handle to the transaction. */
+	spin_lock(&transaction->t_jcb_lock);
+	list_splice(&handle->h_jcb, &transaction->t_jcb);
+	spin_unlock(&transaction->t_jcb_lock);
+
+	/*
+	 * If the handle is marked SYNC, we need to set another commit
+	 * going!  We also want to force a commit if the current
+	 * transaction is occupying too much of the log, or if the
+	 * transaction is too old now.
+	 */
+	if (handle->h_sync ||
+			transaction->t_outstanding_credits >
+				journal->j_max_transaction_buffers ||
+			time_after_eq(jiffies, transaction->t_expires)) {
+		/* Do this even for aborted journals: an abort still
+		 * completes the commit thread, it just doesn't write
+		 * anything to disk. */
+		tid_t tid = transaction->t_tid;
+
+		spin_unlock(&transaction->t_handle_lock);
+		jbd_debug(2, "transaction too old, requesting commit for "
+					"handle %p\n", handle);
+		/* This is non-blocking */
+		__log_start_commit(journal, transaction->t_tid);
+		spin_unlock(&journal->j_state_lock);
+
+		/*
+		 * Special case: JFS_SYNC synchronous updates require us
+		 * to wait for the commit to complete.
+		 */
+		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
+			err = log_wait_commit(journal, tid);
+	} else {
+		spin_unlock(&transaction->t_handle_lock);
+		spin_unlock(&journal->j_state_lock);
+	}
+
+	jbd_free_handle(handle);
+	return err;
+}
+
+/**
+ * int journal_force_commit() - force any uncommitted transactions
+ * @journal: journal to force
+ *
+ * For synchronous operations: force any uncommitted transactions
+ * to disk.  May seem kludgy, but it reuses all the handle batching
+ * code in a very simple manner.
+ */
+int journal_force_commit(journal_t *journal)
+{
+	handle_t *handle;
+	int ret;
+
+	handle = journal_start(journal, 1);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+	} else {
+		handle->h_sync = 1;
+		ret = journal_stop(handle);
+	}
+	return ret;
+}
+
+/*
+ *
+ * List management code snippets: various functions for manipulating the
+ * transaction buffer lists.
+ *
+ */
+
+/*
+ * Append a buffer to a transaction list, given the transaction's list head
+ * pointer.
+ *
+ * j_list_lock is held.
+ *
+ * jbd_lock_bh_state(jh2bh(jh)) is held.
+ */
+
+static inline void
+__blist_add_buffer(struct journal_head **list, struct journal_head *jh)
+{
+	if (!*list) {
+		jh->b_tnext = jh->b_tprev = jh;
+		*list = jh;
+	} else {
+		/* Insert at the tail of the list to preserve order */
+		struct journal_head *first = *list, *last = first->b_tprev;
+		jh->b_tprev = last;
+		jh->b_tnext = first;
+		last->b_tnext = first->b_tprev = jh;
+	}
+}
+
+/*
+ * Remove a buffer from a transaction list, given the transaction's list
+ * head pointer.
+ *
+ * Called with j_list_lock held, and the journal may not be locked.
+ *
+ * jbd_lock_bh_state(jh2bh(jh)) is held.
+ */
+
+static inline void
+__blist_del_buffer(struct journal_head **list, struct journal_head *jh)
+{
+	if (*list == jh) {
+		*list = jh->b_tnext;
+		if (*list == jh)
+			*list = 0;
+	}
+	jh->b_tprev->b_tnext = jh->b_tnext;
+	jh->b_tnext->b_tprev = jh->b_tprev;
+}
+
+/*
+ * Remove a buffer from the appropriate transaction list.
+ *
+ * Note that this function can *change* the value of
+ * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
+ * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
+ * is holding onto a copy of one of these pointers, it could go bad.
+ * Generally the caller needs to re-read the pointer from the transaction_t.
+ *
+ * Called under j_list_lock.  The journal may not be locked.
+ */
+void __journal_unfile_buffer(struct journal_head *jh)
+{
+	struct journal_head **list = 0;
+	transaction_t *transaction;
+	struct buffer_head *bh = jh2bh(jh);
+
+	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	transaction = jh->b_transaction;
+	if (transaction)
+		assert_spin_locked(&transaction->t_journal->j_list_lock);
+
+	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+	if (jh->b_jlist != BJ_None)
+		J_ASSERT_JH(jh, transaction != 0);
+
+	switch (jh->b_jlist) {
+	case BJ_None:
+		return;
+	case BJ_SyncData:
+		list = &transaction->t_sync_datalist;
+		break;
+	case BJ_Metadata:
+		transaction->t_nr_buffers--;
+		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
+		list = &transaction->t_buffers;
+		break;
+	case BJ_Forget:
+		list = &transaction->t_forget;
+		break;
+	case BJ_IO:
+		list = &transaction->t_iobuf_list;
+		break;
+	case BJ_Shadow:
+		list = &transaction->t_shadow_list;
+		break;
+	case BJ_LogCtl:
+		list = &transaction->t_log_list;
+		break;
+	case BJ_Reserved:
+		list = &transaction->t_reserved_list;
+		break;
+	}
+
+	__blist_del_buffer(list, jh);
+	jh->b_jlist = BJ_None;
+	if (test_clear_buffer_jbddirty(bh))
+		mark_buffer_dirty(bh);	/* Expose it to the VM */
+}
+
+void journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
+{
+	jbd_lock_bh_state(jh2bh(jh));
+	spin_lock(&journal->j_list_lock);
+	__journal_unfile_buffer(jh);
+	spin_unlock(&journal->j_list_lock);
+	jbd_unlock_bh_state(jh2bh(jh));
+}
+
+/*
+ * Called from journal_try_to_free_buffers().
+ *
+ * Called under jbd_lock_bh_state(bh)
+ */
+static void
+__journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
+{
+	struct journal_head *jh;
+
+	jh = bh2jh(bh);
+
+	if (buffer_locked(bh) || buffer_dirty(bh))
+		goto out;
+
+	if (jh->b_next_transaction != 0)
+		goto out;
+
+	spin_lock(&journal->j_list_lock);
+	if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
+		if (jh->b_jlist == BJ_SyncData) {
+			/* A written-back ordered data buffer */
+			JBUFFER_TRACE(jh, "release data");
+			__journal_unfile_buffer(jh);
+			jh->b_transaction = 0;
+			journal_remove_journal_head(bh);
+			__brelse(bh);
+		}
+	} else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+		/* written-back checkpointed metadata buffer */
+		if (jh->b_jlist == BJ_None) {
+			JBUFFER_TRACE(jh, "remove from checkpoint list");
+			__journal_remove_checkpoint(jh);
+			journal_remove_journal_head(bh);
+			__brelse(bh);
+		}
+	}
+	spin_unlock(&journal->j_list_lock);
+out:
+	return;
+}
+
+
+/**
+ * int journal_try_to_free_buffers() - try to free page buffers.
+ * @journal: journal for operation
+ * @page: to try and free
+ * @gfp_mask: 'IO' mode for try_to_free_buffers()
+ *
+ *
+ * For all the buffers on this page,
+ * if they are fully written out ordered data, move them onto BUF_CLEAN
+ * so try_to_free_buffers() can reap them.
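+ * If any buffer on the page is still attached to the journal after the
+ * scan below, we give up without calling try_to_free_buffers() at all.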
+ * + * This function returns non-zero if we wish try_to_free_buffers() + * to be called. We do this if the page is releasable by try_to_free_buffers(). + * We also do it if the page has locked or dirty buffers and the caller wants + * us to perform sync or async writeout. + * + * This complicates JBD locking somewhat. We aren't protected by the + * BKL here. We wish to remove the buffer from its committing or + * running transaction's ->t_datalist via __journal_unfile_buffer. + * + * This may *change* the value of transaction_t->t_datalist, so anyone + * who looks at t_datalist needs to lock against this function. + * + * Even worse, someone may be doing a journal_dirty_data on this + * buffer. So we need to lock against that. journal_dirty_data() + * will come out of the lock with the buffer dirty, which makes it + * ineligible for release here. + * + * Who else is affected by this? hmm... Really the only contender + * is do_get_write_access() - it could be looking at the buffer while + * journal_try_to_free_buffer() is changing its state. But that + * cannot happen because we never reallocate freed data as metadata + * while the data is part of a transaction. Yes? + */ +int journal_try_to_free_buffers(journal_t *journal, + struct page *page, int unused_gfp_mask) +{ + struct buffer_head *head; + struct buffer_head *bh; + int ret = 0; + + J_ASSERT(PageLocked(page)); + + head = page_buffers(page); + bh = head; + do { + struct journal_head *jh; + + /* + * We take our own ref against the journal_head here to avoid + * having to add tons of locking around each instance of + * journal_remove_journal_head() and journal_put_journal_head(). + */ + jh = journal_grab_journal_head(bh); + if (!jh) + continue; + + jbd_lock_bh_state(bh); + __journal_try_to_free_buffer(journal, bh); + journal_put_journal_head(jh); + jbd_unlock_bh_state(bh); + if (buffer_jbd(bh)) + goto busy; + } while ((bh = bh->b_this_page) != head); + ret = try_to_free_buffers(page); +busy: + return ret; +} + +/* + * This buffer is no longer needed. If it is on an older transaction's + * checkpoint list we need to record it on this transaction's forget list + * to pin this buffer (and hence its checkpointing transaction) down until + * this transaction commits. If the buffer isn't on a checkpoint list, we + * release it. + * Returns non-zero if JBD no longer has an interest in the buffer. + * + * Called under j_list_lock. + * + * Called under jbd_lock_bh_state(bh). + */ +static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) +{ + int may_free = 1; + struct buffer_head *bh = jh2bh(jh); + + __journal_unfile_buffer(jh); + jh->b_transaction = 0; + + if (jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "on running+cp transaction"); + __journal_file_buffer(jh, transaction, BJ_Forget); + clear_buffer_jbddirty(bh); + may_free = 0; + } else { + JBUFFER_TRACE(jh, "on running transaction"); + journal_remove_journal_head(bh); + __brelse(bh); + } + return may_free; +} + +/* + * journal_invalidatepage + * + * This code is tricky. It has a number of cases to deal with. + * + * There are two invariants which this code relies on: + * + * i_size must be updated on disk before we start calling invalidatepage on the + * data. + * + * This is done in ext3 by defining an ext3_setattr method which + * updates i_size before truncate gets going. 
By maintaining this + * invariant, we can be sure that it is safe to throw away any buffers + * attached to the current transaction: once the transaction commits, + * we know that the data will not be needed. + * + * Note however that we can *not* throw away data belonging to the + * previous, committing transaction! + * + * Any disk blocks which *are* part of the previous, committing + * transaction (and which therefore cannot be discarded immediately) are + * not going to be reused in the new running transaction + * + * The bitmap committed_data images guarantee this: any block which is + * allocated in one transaction and removed in the next will be marked + * as in-use in the committed_data bitmap, so cannot be reused until + * the next transaction to delete the block commits. This means that + * leaving committing buffers dirty is quite safe: the disk blocks + * cannot be reallocated to a different file and so buffer aliasing is + * not possible. + * + * + * The above applies mainly to ordered data mode. In writeback mode we + * don't make guarantees about the order in which data hits disk --- in + * particular we don't guarantee that new dirty data is flushed before + * transaction commit --- so it is always safe just to discard data + * immediately in that mode. --sct + */ + +/* + * The journal_unmap_buffer helper function returns zero if the buffer + * concerned remains pinned as an anonymous buffer belonging to an older + * transaction. + * + * We're outside-transaction here. Either or both of j_running_transaction + * and j_committing_transaction may be NULL. + */ +static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) +{ + transaction_t *transaction; + struct journal_head *jh; + int may_free = 1; + int ret; + + BUFFER_TRACE(bh, "entry"); + + /* + * It is safe to proceed here without the j_list_lock because the + * buffers cannot be stolen by try_to_free_buffers as long as we are + * holding the page lock. --sct + */ + + if (!buffer_jbd(bh)) + goto zap_buffer_unlocked; + + spin_lock(&journal->j_state_lock); + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + /* + * Now we have the locks, check again to see whether kjournald has + * taken the buffer off the transaction. + */ + if (!buffer_jbd(bh)) + goto zap_buffer; + + jh = bh2jh(bh); + transaction = jh->b_transaction; + if (transaction == NULL) { + /* First case: not on any transaction. If it + * has no checkpoint link, then we can zap it: + * it's a writeback-mode buffer so we don't care + * if it hits disk safely. */ + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "not on any transaction: zap"); + goto zap_buffer; + } + + if (!buffer_dirty(bh)) { + /* bdflush has written it. We can drop it now */ + goto zap_buffer; + } + + /* OK, it must be in the journal but still not + * written fully to disk: it's metadata or + * journaled data... */ + + if (journal->j_running_transaction) { + /* ... and once the current transaction has + * committed, the buffer won't be needed any + * longer. */ + JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); + ret = __dispose_buffer(jh, + journal->j_running_transaction); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + spin_unlock(&journal->j_state_lock); + return ret; + } else { + /* There is no currently-running transaction. So the + * orphan record which we wrote for this file must have + * passed into commit. We must attach this buffer to + * the committing transaction, if it exists. 
*/
+                        if (journal->j_committing_transaction) {
+                                JBUFFER_TRACE(jh, "give to committing trans");
+                                ret = __dispose_buffer(jh,
+                                        journal->j_committing_transaction);
+                                spin_unlock(&journal->j_list_lock);
+                                jbd_unlock_bh_state(bh);
+                                spin_unlock(&journal->j_state_lock);
+                                return ret;
+                        } else {
+                                /* The orphan record's transaction has
+                                 * committed.  We can cleanse this buffer */
+                                clear_buffer_jbddirty(bh);
+                                goto zap_buffer;
+                        }
+                }
+        } else if (transaction == journal->j_committing_transaction) {
+                /* If it is committing, we simply cannot touch it.  We
+                 * can remove its next_transaction pointer from the
+                 * running transaction if that is set, but nothing
+                 * else. */
+                JBUFFER_TRACE(jh, "on committing transaction");
+                set_buffer_freed(bh);
+                if (jh->b_next_transaction) {
+                        J_ASSERT(jh->b_next_transaction ==
+                                        journal->j_running_transaction);
+                        jh->b_next_transaction = NULL;
+                }
+                spin_unlock(&journal->j_list_lock);
+                jbd_unlock_bh_state(bh);
+                spin_unlock(&journal->j_state_lock);
+                return 0;
+        } else {
+                /* Good, the buffer belongs to the running transaction.
+                 * We are writing our own transaction's data, not any
+                 * previous one's, so it is safe to throw it away
+                 * (remember that we expect the filesystem to have set
+                 * i_size already for this truncate so recovery will not
+                 * expose the disk blocks we are discarding here.) */
+                J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
+                may_free = __dispose_buffer(jh, transaction);
+        }
+
+zap_buffer:
+        spin_unlock(&journal->j_list_lock);
+        jbd_unlock_bh_state(bh);
+        spin_unlock(&journal->j_state_lock);
+zap_buffer_unlocked:
+        clear_buffer_dirty(bh);
+        J_ASSERT_BH(bh, !buffer_jbddirty(bh));
+        clear_buffer_mapped(bh);
+        clear_buffer_req(bh);
+        clear_buffer_new(bh);
+        bh->b_bdev = NULL;
+        return may_free;
+}
+
+/**
+ * int journal_invalidatepage()
+ * @journal: journal to use for flush...
+ * @page:    page to flush
+ * @offset:  length of page to invalidate.
+ *
+ * Reap page buffers containing data after offset in page.
+ *
+ * Return non-zero if the page's buffers were successfully reaped.
+ */
+int journal_invalidatepage(journal_t *journal,
+                      struct page *page,
+                      unsigned long offset)
+{
+        struct buffer_head *head, *bh, *next;
+        unsigned int curr_off = 0;
+        int may_free = 1;
+
+        if (!PageLocked(page))
+                BUG();
+        if (!page_has_buffers(page))
+                return 1;
+
+        /* We will potentially be playing with lists other than just the
+         * data lists (especially for journaled data mode), so be
+         * cautious in our locking. */
+
+        head = bh = page_buffers(page);
+        do {
+                unsigned int next_off = curr_off + bh->b_size;
+                next = bh->b_this_page;
+
+                /* AKPM: doing lock_buffer here may be overly paranoid */
+                if (offset <= curr_off) {
+                        /* This block is wholly outside the truncation point */
+                        lock_buffer(bh);
+                        may_free &= journal_unmap_buffer(journal, bh);
+                        unlock_buffer(bh);
+                }
+                curr_off = next_off;
+                bh = next;
+
+        } while (bh != head);
+
+        if (!offset) {
+                if (!may_free || !try_to_free_buffers(page))
+                        return 0;
+                J_ASSERT(!page_has_buffers(page));
+        }
+        return 1;
+}
+
+/*
+ * File a buffer on the given transaction list.
+ */ +void __journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + struct journal_head **list = 0; + int was_dirty = 0; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + assert_spin_locked(&transaction->t_journal->j_list_lock); + + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_transaction == 0); + + if (jh->b_transaction && jh->b_jlist == jlist) + return; + + /* The following list of buffer states needs to be consistent + * with __jbd_unexpected_dirty_buffer()'s handling of dirty + * state. */ + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + if (test_clear_buffer_dirty(bh) || + test_clear_buffer_jbddirty(bh)) + was_dirty = 1; + } + + if (jh->b_transaction) + __journal_unfile_buffer(jh); + else + jh->b_transaction = transaction; + + switch (jlist) { + case BJ_None: + J_ASSERT_JH(jh, !jh->b_committed_data); + J_ASSERT_JH(jh, !jh->b_frozen_data); + return; + case BJ_SyncData: + list = &transaction->t_sync_datalist; + break; + case BJ_Metadata: + transaction->t_nr_buffers++; + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + } + + __blist_add_buffer(list, jh); + jh->b_jlist = jlist; + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +void journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) +{ + jbd_lock_bh_state(jh2bh(jh)); + spin_lock(&transaction->t_journal->j_list_lock); + __journal_file_buffer(jh, transaction, jlist); + spin_unlock(&transaction->t_journal->j_list_lock); + jbd_unlock_bh_state(jh2bh(jh)); +} + +/* + * Remove a buffer from its current buffer list in preparation for + * dropping it from its current transaction entirely. If the buffer has + * already started to be used by a subsequent transaction, refile the + * buffer on that transaction's metadata list. + * + * Called under journal->j_list_lock + * + * Called under jbd_lock_bh_state(jh2bh(jh)) + */ +void __journal_refile_buffer(struct journal_head *jh) +{ + int was_dirty; + struct buffer_head *bh = jh2bh(jh); + + J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh)); + if (jh->b_transaction) + assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock); + + /* If the buffer is now unused, just drop it. */ + if (jh->b_next_transaction == NULL) { + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + return; + } + + /* + * It has been modified by a later transaction: add it to the new + * transaction's metadata list. + */ + + was_dirty = test_clear_buffer_jbddirty(bh); + __journal_unfile_buffer(jh); + jh->b_transaction = jh->b_next_transaction; + jh->b_next_transaction = NULL; + __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); + J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); + + if (was_dirty) + set_buffer_jbddirty(bh); +} + +/* + * For the unlocked version of this call, also make sure that any + * hanging journal_head is cleaned up if necessary. + * + * __journal_refile_buffer is usually called as part of a single locked + * operation on a buffer_head, in which the caller is probably going to + * be hooking the journal_head onto other lists. 
In that case it is up + * to the caller to remove the journal_head if necessary. For the + * unlocked journal_refile_buffer call, the caller isn't going to be + * doing anything else to the buffer so we need to do the cleanup + * ourselves to avoid a jh leak. + * + * *** The journal_head may be freed by this call! *** + */ +void journal_refile_buffer(journal_t *journal, struct journal_head *jh) +{ + struct buffer_head *bh = jh2bh(jh); + + jbd_lock_bh_state(bh); + spin_lock(&journal->j_list_lock); + + __journal_refile_buffer(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + + spin_unlock(&journal->j_list_lock); + __brelse(bh); +} diff -Nru a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c --- a/fs/jfs/jfs_metapage.c Fri Oct 31 14:10:53 2003 +++ b/fs/jfs/jfs_metapage.c Fri Oct 31 14:10:53 2003 @@ -511,9 +511,6 @@ if (mp) { set_bit(META_discard, &mp->flag); spin_unlock(&meta_lock); - lock_page(mp->page); - block_invalidatepage(mp->page, 0); - unlock_page(mp->page); } else { spin_unlock(&meta_lock); page = find_lock_page(mapping, lblock>>l2BlocksPerPage); diff -Nru a/fs/jfs/namei.c b/fs/jfs/namei.c --- a/fs/jfs/namei.c Fri Oct 31 14:10:54 2003 +++ b/fs/jfs/namei.c Fri Oct 31 14:10:54 2003 @@ -772,15 +772,16 @@ jfs_info("jfs_link: %s %s", old_dentry->d_name.name, dentry->d_name.name); + if (ip->i_nlink == JFS_LINK_MAX) + return -EMLINK; + + if (ip->i_nlink == 0) + return -ENOENT; + tid = txBegin(ip->i_sb, 0); down(&JFS_IP(dir)->commit_sem); down(&JFS_IP(ip)->commit_sem); - - if (ip->i_nlink == JFS_LINK_MAX) { - rc = -EMLINK; - goto out; - } /* * scan parent directory for entry/freespace diff -Nru a/include/asm-h8300/smplock.h b/include/asm-h8300/smplock.h --- a/include/asm-h8300/smplock.h Fri Oct 31 14:10:54 2003 +++ /dev/null Wed Dec 31 16:00:00 1969 @@ -1,51 +0,0 @@ -/* - * <asm/smplock.h> - * - * Default SMP lock implementation - */ -#include <linux/interrupt.h> -#include <linux/spinlock.h> - -extern spinlock_t kernel_flag; - -#define kernel_locked() spin_is_locked(&kernel_flag) - -/* - * Release global kernel lock and global interrupt lock - */ -#define release_kernel_lock(task, cpu) \ -do { \ - if (task->lock_depth >= 0) \ - spin_unlock(&kernel_flag); \ - release_irqlock(cpu); \ - __sti(); \ -} while (0) - -/* - * Re-acquire the kernel lock - */ -#define reacquire_kernel_lock(task) \ -do { \ - if (task->lock_depth >= 0) \ - spin_lock(&kernel_flag); \ -} while (0) - - -/* - * Getting the big kernel lock. - * - * This cannot happen asynchronously, - * so we only need to worry about other - * CPU's. 
- */
-extern __inline__ void lock_kernel(void)
-{
-        if (!++current->lock_depth)
-                spin_lock(&kernel_flag);
-}
-
-extern __inline__ void unlock_kernel(void)
-{
-        if (--current->lock_depth < 0)
-                spin_unlock(&kernel_flag);
-}
diff -Nru a/include/asm-i386/spinlock.h b/include/asm-i386/spinlock.h
--- a/include/asm-i386/spinlock.h	Fri Oct 31 14:10:53 2003
+++ b/include/asm-i386/spinlock.h	Fri Oct 31 14:10:53 2003
@@ -4,6 +4,7 @@
 #include <asm/atomic.h>
 #include <asm/rwlock.h>
 #include <asm/page.h>
+#include <asm/current.h>
 #include <linux/config.h>
 #include <linux/compiler.h>
@@ -18,15 +19,31 @@
 	volatile unsigned int lock;
 #ifdef CONFIG_DEBUG_SPINLOCK
 	unsigned magic;
+	void *owner;
 #endif
 } spinlock_t;
 
 #define SPINLOCK_MAGIC	0xdead4ead
 
 #ifdef CONFIG_DEBUG_SPINLOCK
-#define SPINLOCK_MAGIC_INIT	, SPINLOCK_MAGIC
+#define SPINLOCK_MAGIC_INIT	, SPINLOCK_MAGIC, NULL
+#include <asm/current.h>
+#define SPIN_DONT_CHECK ((void *)1)
+static inline void spin_lock_dont_check(spinlock_t *lock)
+{
+	lock->owner = SPIN_DONT_CHECK;
+}
+
+#define spin_bug(lock) \
+({ \
+	printk("spinlock bug: %p: %x, %x, %p, %p\n", \
+		lock, lock->lock, lock->magic, lock->owner, get_current()); \
+	BUG(); \
+})
+
 #else
 #define SPINLOCK_MAGIC_INIT	/* */
+#define spin_lock_dont_check(lock)
 #endif
 
 #define SPIN_LOCK_UNLOCKED (spinlock_t) { 1 SPINLOCK_MAGIC_INIT }
@@ -71,8 +88,15 @@
 static inline void _raw_spin_unlock(spinlock_t *lock)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(lock->magic != SPINLOCK_MAGIC);
-	BUG_ON(!spin_is_locked(lock));
+	if (lock->magic != SPINLOCK_MAGIC)
+		spin_bug(lock);
+	if (lock->owner != SPIN_DONT_CHECK) {
+		if (!spin_is_locked(lock))
+			spin_bug(lock);
+		if (lock->owner != get_current())
+			spin_bug(lock);
+		lock->owner = NULL;
+	}
 #endif
 	__asm__ __volatile__(
 		spin_unlock_string
@@ -90,8 +114,15 @@
 {
 	char oldval = 1;
 #ifdef CONFIG_DEBUG_SPINLOCK
-	BUG_ON(lock->magic != SPINLOCK_MAGIC);
-	BUG_ON(!spin_is_locked(lock));
+	if (lock->magic != SPINLOCK_MAGIC)
+		spin_bug(lock);
+	if (lock->owner != SPIN_DONT_CHECK) {
+		if (!spin_is_locked(lock))
+			spin_bug(lock);
+		if (lock->owner != get_current())
+			spin_bug(lock);
+		lock->owner = NULL;
+	}
 #endif
 	__asm__ __volatile__(
 		spin_unlock_string
@@ -107,6 +138,13 @@
 		"xchgb %b0,%1"
 		:"=q" (oldval), "=m" (lock->lock)
 		:"0" (0) : "memory");
+#ifdef CONFIG_DEBUG_SPINLOCK
+	if (oldval > 0 && lock->owner != SPIN_DONT_CHECK) {
+		if (lock->owner != 0)
+			spin_bug(lock);
+		lock->owner = get_current();
+	}
+#endif
 	return oldval > 0;
 }
@@ -117,12 +155,19 @@
 here:
 	if (unlikely(lock->magic != SPINLOCK_MAGIC)) {
 		printk("eip: %p\n", &&here);
-		BUG();
+		spin_bug(lock);
 	}
 #endif
 	__asm__ __volatile__(
 		spin_lock_string
 		:"=m" (lock->lock) : : "memory");
+#ifdef CONFIG_DEBUG_SPINLOCK
+	if (lock->owner != SPIN_DONT_CHECK) {
+		if (lock->owner != 0)
+			spin_bug(lock);
+		lock->owner = get_current();
+	}
+#endif
 }
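The owner field introduced above is what makes this debugging variant stronger than the old BUG_ON() pair: it catches a lock being released by a task that never acquired it, and SPIN_DONT_CHECK provides an opt-out for locks that are legitimately handed from one task to another. A minimal user-space sketch of the same owner-tracking idea follows; the debug_spinlock type, the pthread-based implementation and every name in it are illustrative assumptions, not part of the kernel change above.

/* Stand-alone sketch of owner-checked locking (illustrative only). */
#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct debug_spinlock {
        pthread_mutex_t mutex;  /* stands in for the lock word */
        pthread_t owner;        /* mirrors spinlock_t.owner above */
        int held;
};

static void debug_lock(struct debug_spinlock *l)
{
        pthread_mutex_lock(&l->mutex);
        assert(!l->held);               /* relock of a held lock */
        l->held = 1;
        l->owner = pthread_self();      /* record owner, as _raw_spin_lock now does */
}

static void debug_unlock(struct debug_spinlock *l)
{
        assert(l->held);                /* unlock of an unlocked lock */
        /* the new check: only the task that took the lock may drop it */
        assert(pthread_equal(l->owner, pthread_self()));
        l->held = 0;
        pthread_mutex_unlock(&l->mutex);
}

int main(void)
{
        struct debug_spinlock l = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

        debug_lock(&l);
        debug_unlock(&l);       /* same thread: passes both checks */
        printf("owner checks passed\n");
        return 0;
}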
diff -Nru a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
--- a/include/asm-i386/unistd.h	Fri Oct 31 14:10:54 2003
+++ b/include/asm-i386/unistd.h	Fri Oct 31 14:10:54 2003
@@ -279,8 +279,9 @@
 #define __NR_utimes		271
 #define __NR_fadvise64_64	272
 #define __NR_vserver		273
+#define __NR_reiser4		274
 
-#define NR_syscalls 274
+#define NR_syscalls 275
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
 
@@ -396,6 +397,7 @@
 static inline _syscall1(int,close,int,fd)
 static inline _syscall1(int,_exit,int,exitcode)
 static inline _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options)
+static inline _syscall1(long,reiser4,char*,p_strIng)
 
 #endif
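The two hunks above are all that is needed to expose a new system call number on i386: allocate __NR_reiser4, grow NR_syscalls to match, and (for kernel-internal callers) declare a _syscall1() stub. From user space the slot can also be exercised without any stub via syscall(2), as in the hedged sketch below; the number 274 comes from the hunk above, while the probe itself, its names and the dummy argument are illustrative only.

/* Illustrative user-space probe for the new syscall slot. */
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

#ifndef __NR_reiser4
#define __NR_reiser4 274        /* matches the unistd.h hunk above */
#endif

int main(void)
{
        long ret = syscall(__NR_reiser4, "dummy");

        if (ret == -1 && errno == ENOSYS)
                printf("sys_reiser4 is not wired into this kernel\n");
        else
                printf("sys_reiser4 returned %ld\n", ret);
        return 0;
}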
diff -Nru a/include/asm-ia64/io.h b/include/asm-ia64/io.h
--- a/include/asm-ia64/io.h	Fri Oct 31 14:10:53 2003
+++ b/include/asm-ia64/io.h	Fri Oct 31 14:10:53 2003
@@ -72,6 +72,9 @@
 	return (void *) (address + PAGE_OFFSET);
 }
 
+#define ARCH_HAS_VALID_PHYS_ADDR_RANGE
+extern int valid_phys_addr_range (unsigned long addr, size_t *count); /* efi.c */
+
 /*
  * The following two macros are deprecated and scheduled for removal.
  * Please use the PCI-DMA interface defined in <asm/pci.h> instead.
diff -Nru a/include/asm-ia64/module.h b/include/asm-ia64/module.h
--- a/include/asm-ia64/module.h	Fri Oct 31 14:10:53 2003
+++ b/include/asm-ia64/module.h	Fri Oct 31 14:10:53 2003
@@ -18,7 +18,8 @@
 	struct elf64_shdr *unwind;	/* unwind-table section */
 	unsigned long gp;		/* global-pointer for module */
 
-	void *unw_table;		/* unwind-table cookie returned by unwinder */
+	void *core_unw_table;		/* core unwind-table cookie returned by unwinder */
+	void *init_unw_table;		/* init unwind-table cookie returned by unwinder */
 	unsigned int next_got_entry;	/* index of next available got entry */
 };
diff -Nru a/include/asm-ia64/namei.h b/include/asm-ia64/namei.h
--- a/include/asm-ia64/namei.h	Fri Oct 31 14:10:54 2003
+++ b/include/asm-ia64/namei.h	Fri Oct 31 14:10:54 2003
@@ -9,7 +9,7 @@
 #include <asm/ptrace.h>
 #include <asm/system.h>
 
-#define EMUL_PREFIX_LINUX_IA32 "emul/ia32-linux/"
+#define EMUL_PREFIX_LINUX_IA32 "/emul/ia32-linux/"
 
 static inline char *
 __emul_prefix (void)
diff -Nru a/include/asm-ia64/unwind.h b/include/asm-ia64/unwind.h
--- a/include/asm-ia64/unwind.h	Fri Oct 31 14:10:54 2003
+++ b/include/asm-ia64/unwind.h	Fri Oct 31 14:10:54 2003
@@ -93,6 +93,12 @@
  * The official API follows below:
  */
 
+struct unw_table_entry {
+	u64 start_offset;
+	u64 end_offset;
+	u64 info_offset;
+};
+
 /*
  * Initialize unwind support.
*/ diff -Nru a/include/asm-sparc/ioctl.h b/include/asm-sparc/ioctl.h --- a/include/asm-sparc/ioctl.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-sparc/ioctl.h Fri Oct 31 14:10:54 2003 @@ -54,7 +54,9 @@ (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) ) #define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK) #define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK) -#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_XSIZEMASK) +#define _IOC_SIZE(nr) \ + ((((((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) & (_IOC_WRITE|_IOC_READ)) == 0)? \ + 0: (((nr) >> _IOC_SIZESHIFT) & _IOC_XSIZEMASK)) /* ...and for the PCMCIA and sound. */ #define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT) diff -Nru a/include/asm-sparc/namei.h b/include/asm-sparc/namei.h --- a/include/asm-sparc/namei.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-sparc/namei.h Fri Oct 31 14:10:53 2003 @@ -8,8 +8,8 @@ #ifndef __SPARC_NAMEI_H #define __SPARC_NAMEI_H -#define SPARC_BSD_EMUL "usr/gnemul/sunos/" -#define SPARC_SOL_EMUL "usr/gnemul/solaris/" +#define SPARC_BSD_EMUL "/usr/gnemul/sunos/" +#define SPARC_SOL_EMUL "/usr/gnemul/solaris/" static inline char * __emul_prefix(void) { diff -Nru a/include/asm-sparc/unistd.h b/include/asm-sparc/unistd.h --- a/include/asm-sparc/unistd.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-sparc/unistd.h Fri Oct 31 14:10:53 2003 @@ -284,10 +284,15 @@ #define __NR_timer_delete 265 #define __NR_timer_create 266 /* #define __NR_vserver 267 Reserved for VSERVER */ -/* WARNING: You MAY NOT add syscall numbers larger than 267, since +#define __NR_io_setup 268 +#define __NR_io_destroy 268 +#define __NR_io_submit 269 +#define __NR_io_cancel 270 +#define __NR_io_getevents 271 +/* WARNING: You MAY NOT add syscall numbers larger than 271, since * all of the syscall tables in the Sparc kernel are - * sized to have 267 entries (starting at zero). Therefore - * find a free slot in the 0-266 range. + * sized to have 272 entries (starting at zero). Therefore + * find a free slot in the 0-271 range. */ #define _syscall0(type,name) \ diff -Nru a/include/asm-sparc64/hardirq.h b/include/asm-sparc64/hardirq.h --- a/include/asm-sparc64/hardirq.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-sparc64/hardirq.h Fri Oct 31 14:10:54 2003 @@ -79,7 +79,8 @@ #define irq_enter() (preempt_count() += HARDIRQ_OFFSET) #ifdef CONFIG_PREEMPT -# define in_atomic() (preempt_count() != kernel_locked()) +# include <linux/smp_lock.h> +# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked()) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) #else # define in_atomic() (preempt_count() != 0) diff -Nru a/include/asm-sparc64/ioctl.h b/include/asm-sparc64/ioctl.h --- a/include/asm-sparc64/ioctl.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-sparc64/ioctl.h Fri Oct 31 14:10:54 2003 @@ -54,7 +54,9 @@ (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) ) #define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK) #define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK) -#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_XSIZEMASK) +#define _IOC_SIZE(nr) \ + ((((((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) & (_IOC_WRITE|_IOC_READ)) == 0)? \ + 0: (((nr) >> _IOC_SIZESHIFT) & _IOC_XSIZEMASK)) /* ...and for the PCMCIA and sound. 
*/ #define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT) diff -Nru a/include/asm-sparc64/namei.h b/include/asm-sparc64/namei.h --- a/include/asm-sparc64/namei.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-sparc64/namei.h Fri Oct 31 14:10:53 2003 @@ -8,8 +8,8 @@ #ifndef __SPARC64_NAMEI_H #define __SPARC64_NAMEI_H -#define SPARC_BSD_EMUL "usr/gnemul/sunos/" -#define SPARC_SOL_EMUL "usr/gnemul/solaris/" +#define SPARC_BSD_EMUL "/usr/gnemul/sunos/" +#define SPARC_SOL_EMUL "/usr/gnemul/solaris/" static inline char * __emul_prefix(void) { diff -Nru a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h --- a/include/asm-sparc64/spinlock.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-sparc64/spinlock.h Fri Oct 31 14:10:53 2003 @@ -118,11 +118,13 @@ extern void __read_unlock(rwlock_t *); extern void __write_lock(rwlock_t *); extern void __write_unlock(rwlock_t *); +extern int __write_trylock(rwlock_t *); #define _raw_read_lock(p) __read_lock(p) #define _raw_read_unlock(p) __read_unlock(p) #define _raw_write_lock(p) __write_lock(p) #define _raw_write_unlock(p) __write_unlock(p) +#define _raw_write_trylock(p) __write_trylock(p) #else /* !(CONFIG_DEBUG_SPINLOCK) */ diff -Nru a/include/asm-sparc64/unistd.h b/include/asm-sparc64/unistd.h --- a/include/asm-sparc64/unistd.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-sparc64/unistd.h Fri Oct 31 14:10:53 2003 @@ -286,10 +286,15 @@ #define __NR_timer_delete 265 #define __NR_timer_create 266 /* #define __NR_vserver 267 Reserved for VSERVER */ -/* WARNING: You MAY NOT add syscall numbers larger than 267, since +#define __NR_io_setup 268 +#define __NR_io_destroy 268 +#define __NR_io_submit 269 +#define __NR_io_cancel 270 +#define __NR_io_getevents 271 +/* WARNING: You MAY NOT add syscall numbers larger than 271, since * all of the syscall tables in the Sparc kernel are - * sized to have 267 entries (starting at zero). Therefore - * find a free slot in the 0-266 range. + * sized to have 272 entries (starting at zero). Therefore + * find a free slot in the 0-271 range. */ #define _syscall0(type,name) \ diff -Nru a/include/asm-um/archparam-i386.h b/include/asm-um/archparam-i386.h --- a/include/asm-um/archparam-i386.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-um/archparam-i386.h Fri Oct 31 14:10:54 2003 @@ -56,6 +56,65 @@ pr_reg[16] = PT_REGS_SS(regs); \ } while(0); +#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL)) +#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE) +#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall) +extern void *__kernel_vsyscall; + +/* + * Architecture-neutral AT_ values in 0-17, leave some room + * for more of them, start the x86-specific ones at 32. + */ +#define AT_SYSINFO 32 +#define AT_SYSINFO_EHDR 33 + +#define ARCH_DLINFO \ +do { \ + NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ +} while (0) + +/* + * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out + * extra segments containing the vsyscall DSO contents. Dumping its + * contents makes post-mortem fully interpretable later without matching up + * the same kernel and hardware config to see what PC values meant. + * Dumping its extra ELF program headers includes all the other information + * a debugger needs to easily find how the vsyscall DSO was being used. 
+ */
+#define ELF_CORE_EXTRA_PHDRS		(VSYSCALL_EHDR->e_phnum)
+#define ELF_CORE_WRITE_EXTRA_PHDRS \
+do { \
+	const struct elf_phdr *const vsyscall_phdrs = \
+		(const struct elf_phdr *) (VSYSCALL_BASE \
+					   + VSYSCALL_EHDR->e_phoff); \
+	int i; \
+	Elf32_Off ofs = 0; \
+	for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \
+		struct elf_phdr phdr = vsyscall_phdrs[i]; \
+		if (phdr.p_type == PT_LOAD) { \
+			ofs = phdr.p_offset = offset; \
+			offset += phdr.p_filesz; \
+		} \
+		else \
+			phdr.p_offset += ofs; \
+		phdr.p_paddr = 0; /* match other core phdrs */ \
+		DUMP_WRITE(&phdr, sizeof(phdr)); \
+	} \
+} while (0)
+#define ELF_CORE_WRITE_EXTRA_DATA \
+do { \
+	const struct elf_phdr *const vsyscall_phdrs = \
+		(const struct elf_phdr *) (VSYSCALL_BASE \
+					   + VSYSCALL_EHDR->e_phoff); \
+	int i; \
+	for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \
+		if (vsyscall_phdrs[i].p_type == PT_LOAD) \
+			DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr, \
+				   vsyscall_phdrs[i].p_filesz); \
+	} \
+} while (0)
+
 /********* Bits for asm-um/delay.h **********/
 
 typedef unsigned long um_udelay_t;
diff -Nru a/include/asm-um/bug.h b/include/asm-um/bug.h
--- a/include/asm-um/bug.h	Fri Oct 31 14:10:53 2003
+++ b/include/asm-um/bug.h	Fri Oct 31 14:10:53 2003
@@ -1,30 +1,19 @@
 #ifndef __UM_BUG_H
 #define __UM_BUG_H
 
-#ifndef __ASSEMBLY__
+#include "arch/bug.h"
+
+#undef BUG
+#undef PAGE_BUG
 
 #define BUG() do { \
-	panic("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
+	dump_stack(); \
+	panic("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
 } while (0)
 
-#define BUG_ON(condition) do { \
-	if (unlikely((condition)!=0)) \
-		BUG(); \
-} while(0)
-
 #define PAGE_BUG(page) do { \
-	BUG(); \
+	BUG(); \
 } while (0)
 
-#define WARN_ON(condition) do { \
-	if (unlikely((condition)!=0)) { \
-		printk("Badness in %s at %s:%d\n", __FUNCTION__, __FILE__, __LINE__); \
-		dump_stack(); \
-	} \
-} while (0)
-
-extern int foo;
-
-#endif
 
 #endif
diff -Nru a/include/asm-um/common.lds.S b/include/asm-um/common.lds.S
--- a/include/asm-um/common.lds.S	Fri Oct 31 14:10:53 2003
+++ b/include/asm-um/common.lds.S	Fri Oct 31 14:10:53 2003
@@ -1,3 +1,5 @@
+#include <asm-generic/vmlinux.lds.h>
+
   .fini : { *(.fini) } =0x9090
   _etext = .;
   PROVIDE (etext = .);
@@ -13,14 +15,6 @@
 
   RODATA
 
-  __start___ksymtab = .;	/* Kernel symbol table */
-  __ksymtab : { *(__ksymtab) }
-  __stop___ksymtab = .;
-
-  __start___gpl_ksymtab = .;	/* Kernel symbol table: GPL-only symbols */
-  __gpl_ksymtab : { *(__gpl_ksymtab) }
-  __stop___gpl_ksymtab = .;
-
   __start___kallsyms = .;	/* All kernel symbols */
   __kallsyms : { *(__kallsyms) }
   __stop___kallsyms = .;
@@ -67,6 +61,12 @@
   }
   __initcall_end = .;
 
+  __con_initcall_start = .;
+  .con_initcall.init : { *(.con_initcall.init) }
+  __con_initcall_end = .;
+
+  SECURITY_INIT
+
   __uml_initcall_start = .;
   .uml.initcall.init : { *(.uml.initcall.init) }
   __uml_initcall_end = .;
@@ -80,7 +80,33 @@
   .uml.exitcall : { *(.uml.exitcall.exit) }
   __uml_exitcall_end = .;
 
-  . = ALIGN(4096);
+  . = ALIGN(4);
+  __alt_instructions = .;
+  .altinstructions : { *(.altinstructions) }
+  __alt_instructions_end = .;
+  .altinstr_replacement : { *(.altinstr_replacement) }
+  /* .exit.text is discard at runtime, not link time, to deal with references
+     from .altinstructions and .eh_frame */
+  .exit.text : { *(.exit.text) }
+  .exit.data : { *(.exit.data) }
+
+  __preinit_array_start = .;
+  .preinit_array : { *(.preinit_array) }
+  __preinit_array_end = .;
+  __init_array_start = .;
+  .init_array : { *(.init_array) }
+  __init_array_end = .;
+  __fini_array_start = .;
+  .fini_array : { *(.fini_array) }
+  __fini_array_end = .;
+
+  . = ALIGN(4096);
   __initramfs_start = .;
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
+
+  /* Sections to be discarded */
+  /DISCARD/ : {
+	*(.exitcall.exit)
+  }
+
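The new .preinit_array/.init_array/.fini_array output sections collected above are where current toolchains place constructor and destructor pointers, which matters for UML because its kernel links much like an ordinary userspace program. A small hedged illustration of what lands in those sections follows; it is plain gcc-style C, nothing UML-specific, and every name in it is an assumption made for the example.

/* Illustrative only: gcc emits a pointer to early_setup into .init_array
 * (or .ctors on older toolchains); the linker script hunk above is what
 * keeps such sections from being dropped from the final binary. */
#include <stdio.h>

__attribute__((constructor))
static void early_setup(void)
{
        printf("constructor ran before main()\n");
}

int main(void)
{
        printf("main() running\n");
        return 0;
}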
diff -Nru a/include/asm-um/cpufeature.h b/include/asm-um/cpufeature.h
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/include/asm-um/cpufeature.h	Fri Oct 31 14:10:54 2003
@@ -0,0 +1,6 @@
+#ifndef _ASM_UM_CPUFEATUER_H
+#define _ASM_UM_CPUFEATUER_H
+
+#include "asm/arch/cpufeature.h"
+
+#endif
diff -Nru a/include/asm-um/current.h b/include/asm-um/current.h
--- a/include/asm-um/current.h	Fri Oct 31 14:10:53 2003
+++ b/include/asm-um/current.h	Fri Oct 31 14:10:53 2003
@@ -16,10 +16,14 @@
 #define CURRENT_THREAD(dummy) (((unsigned long) &dummy) & \
 			       (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER))
 
-#define current ({ int dummy; \
-                   ((struct thread_info *) CURRENT_THREAD(dummy))->task; })
+#define current_thread \
+	({ int dummy; ((struct thread_info *) CURRENT_THREAD(dummy)); })
+
+#define current (current_thread->task)
 
 #endif /* __ASSEMBLY__ */
+
+extern void *get_current(void);
 
 #endif
diff -Nru a/include/asm-um/fixmap.h b/include/asm-um/fixmap.h
--- a/include/asm-um/fixmap.h	Fri Oct 31 14:10:54 2003
+++ b/include/asm-um/fixmap.h	Fri Oct 31 14:10:54 2003
@@ -3,6 +3,7 @@
 
 #include <linux/config.h>
 #include <asm/kmap_types.h>
+#include <asm/bug.h>
 
 /*
  * Here we define all the compile-time 'special' virtual
@@ -34,6 +35,7 @@
 	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
 	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
 #endif
+	FIX_VSYSCALL,
 	__end_of_fixed_addresses
 };
 
@@ -62,6 +64,13 @@
 
 #define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
 #define __virt_to_fix(x)	((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
+
+/*
+ * This is the range that is readable by user mode, and things
+ * acting like user mode such as get_user_pages.
+ */
+#define FIXADDR_USER_START	(__fix_to_virt(FIX_VSYSCALL))
+#define FIXADDR_USER_END	(FIXADDR_USER_START + PAGE_SIZE)
 
 extern void __this_fixmap_does_not_exist(void);
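FIXADDR_USER_START/END above fall directly out of the fixmap arithmetic: slots are single pages allocated downwards from FIXADDR_TOP, so FIX_VSYSCALL maps to one fixed page that user space is allowed to read. The sketch below simply replays that arithmetic; the FIXADDR_TOP value and the four-slot kmap range are made-up constants for the example, the real values come from get_kmem_end() and the UML headers.

/* Illustrative replay of the fixmap index-to-address arithmetic. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define FIXADDR_TOP	0xbffff000UL	/* example value only */

enum fixed_addresses {
        FIX_KMAP_BEGIN,
        FIX_KMAP_END = FIX_KMAP_BEGIN + 3,      /* stand-in for KM_TYPE_NR*NR_CPUS-1 */
        FIX_VSYSCALL,                           /* the slot added by this patch */
        __end_of_fixed_addresses
};

#define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))

int main(void)
{
        unsigned long start = __fix_to_virt(FIX_VSYSCALL);

        /* FIXADDR_USER_START..FIXADDR_USER_END is exactly this one page */
        printf("FIX_VSYSCALL page: %#lx..%#lx\n",
               start, start + (1UL << PAGE_SHIFT));
        return 0;
}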
We allocate these special addresses + * from the end of virtual memory (0xfffff000) backwards. + * Also this lets us do fail-safe vmalloc(), we + * can guarantee that these special addresses and + * vmalloc()-ed addresses never overlap. + * + * these 'compile-time allocated' memory buffers are + * fixed-size 4k pages. (or larger if used with an increment + * highger than 1) use fixmap_set(idx,phys) to associate + * physical memory with fixmap indices. + * + * TLB entries of such buffers will not be flushed across + * task switches. + */ + +/* + * on UP currently we will have no trace of the fixmap mechanizm, + * no page table allocations, etc. This might change in the + * future, say framebuffers for the console driver(s) could be + * fix-mapped? + */ +enum fixed_addresses { +#ifdef CONFIG_HIGHMEM + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, +#endif + __end_of_fixed_addresses +}; + +extern void __set_fixmap (enum fixed_addresses idx, + unsigned long phys, pgprot_t flags); + +#define set_fixmap(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL) +/* + * Some hardware wants to get fixmapped without caching. + */ +#define set_fixmap_nocache(idx, phys) \ + __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE) +/* + * used by vmalloc.c. + * + * Leave one empty page between vmalloc'ed areas and + * the start of the fixmap, and leave one page empty + * at the top of mem.. + */ +extern unsigned long get_kmem_end(void); + +#define FIXADDR_TOP (get_kmem_end() - 0x2000) +#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) + +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + +extern void __this_fixmap_does_not_exist(void); + +/* + * 'index to address' translation. If anyone tries to use the idx + * directly without tranlation, we catch the bug with a NULL-deference + * kernel oops. Illegal ranges of incoming indices are caught too. + */ +static inline unsigned long fix_to_virt(const unsigned int idx) +{ + /* + * this branch gets completely eliminated after inlining, + * except when someone tries to use fixaddr indices in an + * illegal way. (such as mixing up address types or using + * out-of-range indices). + * + * If it doesn't get removed, the linker will complain + * loudly with a reasonably clear error message.. + */ + if (idx >= __end_of_fixed_addresses) + __this_fixmap_does_not_exist(); + + return __fix_to_virt(idx); +} + +static inline unsigned long virt_to_fix(const unsigned long vaddr) +{ + BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); + return __virt_to_fix(vaddr); +} + +#endif diff -Nru a/include/asm-um/irq.h b/include/asm-um/irq.h --- a/include/asm-um/irq.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-um/irq.h Fri Oct 31 14:10:54 2003 @@ -1,15 +1,6 @@ #ifndef __UM_IRQ_H #define __UM_IRQ_H -/* The i386 irq.h has a struct task_struct in a prototype without including - * sched.h. This forward declaration kills the resulting warning. 
diff -Nru a/include/asm-um/local.h b/include/asm-um/local.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-um/local.h Fri Oct 31 14:10:54 2003 @@ -0,0 +1,6 @@ +#ifndef _ASM_UM_LOCAL_H +#define _ASM_UM_LOCAL_H + +#include "asm/arch/local.h" + +#endif
diff -Nru a/include/asm-um/module-i386.h b/include/asm-um/module-i386.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-um/module-i386.h Fri Oct 31 14:10:55 2003 @@ -0,0 +1,14 @@ +#ifndef __UM_MODULE_I386_H +#define __UM_MODULE_I386_H + +/* UML is simple */ +struct mod_arch_specific +{ +}; + +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define Elf_Ehdr Elf32_Ehdr + +/* __UM_MODULE_I386_H */ +#endif
diff -Nru a/include/asm-um/page.h b/include/asm-um/page.h --- a/include/asm-um/page.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-um/page.h Fri Oct 31 14:10:54 2003 @@ -4,7 +4,6 @@ struct page; #include "asm/arch/page.h" -#include "asm/bug.h" #undef __pa #undef __va
diff -Nru a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h --- a/include/asm-um/pgtable.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-um/pgtable.h Fri Oct 31 14:10:54 2003 @@ -78,12 +78,13 @@ #define _PAGE_PRESENT 0x001 #define _PAGE_NEWPAGE 0x002 -#define _PAGE_PROTNONE 0x004 /* If not present */ -#define _PAGE_RW 0x008 -#define _PAGE_USER 0x010 -#define _PAGE_ACCESSED 0x020 -#define _PAGE_DIRTY 0x040 -#define _PAGE_NEWPROT 0x080 +#define _PAGE_NEWPROT 0x004 +#define _PAGE_FILE 0x008 /* set:pagecache unset:swap */ +#define _PAGE_PROTNONE 0x010 /* If not present */ +#define _PAGE_RW 0x020 +#define _PAGE_USER 0x040 +#define _PAGE_ACCESSED 0x080 +#define _PAGE_DIRTY 0x100 #define REGION_MASK 0xf0000000 #define REGION_SHIFT 28 @@ -202,6 +203,16 @@ #define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot)) +/* + * Bits 0 through 3 are taken + */ +#define PTE_FILE_MAX_BITS 28 + +#define pte_to_pgoff(pte) ((pte).pte_low >> 4) + +#define pgoff_to_pte(off) \ + ((pte_t) { ((off) << 4) + _PAGE_FILE }) + static inline pte_t pte_mknewprot(pte_t pte) { pte_val(pte) |= _PAGE_NEWPROT; @@ -235,6 +246,12 @@ * The following only work if pte_present() is true. * Undefined behaviour if not.. */ +static inline int pte_user(pte_t pte) +{ + return((pte_val(pte) & _PAGE_USER) && + !(pte_val(pte) & _PAGE_PROTNONE)); +} + static inline int pte_read(pte_t pte) { return((pte_val(pte) & _PAGE_USER) && @@ -252,6 +269,14 @@ !(pte_val(pte) & _PAGE_PROTNONE)); } +/* + * The following only works if pte_present() is not true. + */ +static inline int pte_file(pte_t pte) +{ + return (pte).pte_low & _PAGE_FILE; +} + static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } static inline int pte_newpage(pte_t pte) { return pte_val(pte) & _PAGE_NEWPAGE; } @@ -354,14 +379,26 @@ #define pmd_page(pmd) (phys_mem_map(pmd_val(pmd) & PAGE_MASK) + \ ((phys_addr(pmd_val(pmd)) >> PAGE_SHIFT))) -/* to find an entry in a page-table-directory. */ +/* + * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD] + * + * this macro returns the index of the entry in the pgd page which would + * control the given virtual address + */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) -/* to find an entry in a page-table-directory */ +/* + * pgd_offset() returns a (pgd_t *) + * pgd_index() is used to get the offset into the pgd page's array of pgd_t's; + */ #define pgd_offset(mm, address) \ ((mm)->pgd + ((address) >> PGDIR_SHIFT)) -/* to find an entry in a kernel page-table-directory */ + +/* + * a shortcut which implies the use of the kernel's pgd, instead + * of a process's + */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) #define pmd_index(address) \ @@ -373,7 +410,12 @@ return (pmd_t *) dir; } -/* Find an entry in the third-level page table.. */ +/* + * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE] + * + * this macro returns the index of the entry in the pte page which would + * control the given virtual address + */ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir, address) \ ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) @@ -399,11 +441,11 @@ #define update_mmu_cache(vma,address,pte) do ; while (0) /* Encode and de-code a swap entry */ -#define __swp_type(x) (((x).val >> 3) & 0x7f) -#define __swp_offset(x) ((x).val >> 10) +#define __swp_type(x) (((x).val >> 4) & 0x3f) +#define __swp_offset(x) ((x).val >> 11) #define __swp_entry(type, offset) \ - ((swp_entry_t) { ((type) << 3) | ((offset) << 10) }) + ((swp_entry_t) { ((type) << 4) | ((offset) << 11) }) #define __pte_to_swp_entry(pte) \ ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val })
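The _PAGE_* renumbering above packs the software bits into the low nibble so that bit 3 can become _PAGE_FILE, which distinguishes nonlinear-mapping ptes from swap ptes; that is also why the swap-entry type/offset shifts move from 3/10 to 4/11. A plain-C round trip of the file-pte encoding, with pte_t/pte_low replaced by unsigned long for the sake of a standalone demo:

#include <assert.h>

#define _PAGE_FILE 0x008

/* mirrors pgoff_to_pte(): bits 0-3 are flag bits, leaving 28 bits of
 * file offset, hence PTE_FILE_MAX_BITS 28 */
static unsigned long pgoff_to_pte_demo(unsigned long off)
{
	return (off << 4) + _PAGE_FILE;
}

/* mirrors pte_to_pgoff() */
static unsigned long pte_to_pgoff_demo(unsigned long pte)
{
	return pte >> 4;
}

int main(void)
{
	unsigned long off = 0xabcdef;	/* any offset below 1 << 28 */

	assert(pte_to_pgoff_demo(pgoff_to_pte_demo(off)) == off);
	return 0;
}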
diff -Nru a/include/asm-um/processor-generic.h b/include/asm-um/processor-generic.h --- a/include/asm-um/processor-generic.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-um/processor-generic.h Fri Oct 31 14:10:53 2003 @@ -11,9 +11,7 @@ struct task_struct; #include "linux/config.h" -#include "linux/signal.h" #include "asm/ptrace.h" -#include "asm/siginfo.h" #include "choose-mode.h" struct mm_struct; @@ -101,13 +99,18 @@ } mm_segment_t; extern struct task_struct *alloc_task_struct(void); -extern void free_task_struct(struct task_struct *task); extern void release_thread(struct task_struct *); extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); extern void dump_thread(struct pt_regs *regs, struct user *u); +extern void prepare_to_copy(struct task_struct *tsk); extern unsigned long thread_saved_pc(struct task_struct *t); + +static inline void mm_copy_segments(struct mm_struct *from_mm, + struct mm_struct *new_mm) +{ +} #define init_stack (init_thread_union.stack)
diff -Nru a/include/asm-um/processor-i386.h b/include/asm-um/processor-i386.h --- a/include/asm-um/processor-i386.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-um/processor-i386.h Fri Oct 31 14:10:53 2003 @@ -6,8 +6,8 @@ #ifndef __UM_PROCESSOR_I386_H #define __UM_PROCESSOR_I386_H -extern int cpu_has_xmm; -extern int cpu_has_cmov; +extern int host_has_xmm; +extern int host_has_cmov; struct arch_thread { unsigned long debugregs[8];
diff -Nru a/include/asm-um/sections.h b/include/asm-um/sections.h --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/asm-um/sections.h Fri Oct 31 14:10:54 2003 @@ -0,0 +1,7 @@ +#ifndef _UM_SECTIONS_H +#define _UM_SECTIONS_H + +/* nothing to see, move along */ +#include <asm-generic/sections.h> + +#endif
diff -Nru a/include/asm-um/smp.h b/include/asm-um/smp.h --- a/include/asm-um/smp.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-um/smp.h Fri Oct 31 14:10:53 2003 @@ -10,7 +10,7 @@ extern cpumask_t cpu_online_map; -#define smp_processor_id() (current->thread_info->cpu) +#define smp_processor_id() (current_thread->cpu) #define cpu_logical_map(n) (n) #define cpu_number_map(n) (n) #define PROC_CHANGE_PENALTY 15 /* Pick a number, any number */ @@ -26,6 +26,13 @@ { } +extern inline int any_online_cpu(unsigned int mask) +{ + if (mask & cpu_online_map) + return __ffs(mask & cpu_online_map); + + return -1; +} #endif #endif
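The any_online_cpu() helper added above returns the lowest-numbered online CPU in a candidate mask, or -1 if none of them is online; it can get away with plain word operations because, as the x86_64 topology comment later in this patch also notes, these configurations never have more than BITS_PER_LONG CPUs. A standalone sketch, with POSIX ffs() from strings.h standing in for the kernel's zero-based __ffs():

#include <stdio.h>
#include <strings.h>

static unsigned long cpu_online_map_demo = 0x6;	/* CPUs 1 and 2 online */

static int any_online_cpu_demo(unsigned long mask)
{
	if (mask & cpu_online_map_demo)
		return ffs(mask & cpu_online_map_demo) - 1; /* ffs is 1-based */
	return -1;
}

int main(void)
{
	printf("%d\n", any_online_cpu_demo(0xf));	/* prints 1 */
	printf("%d\n", any_online_cpu_demo(0x8));	/* prints -1 */
	return 0;
}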
diff -Nru a/include/asm-um/system-generic.h b/include/asm-um/system-generic.h --- a/include/asm-um/system-generic.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-um/system-generic.h Fri Oct 31 14:10:54 2003 @@ -23,8 +23,10 @@ extern void block_signals(void); extern void unblock_signals(void); -#define local_save_flags(flags) do { (flags) = get_signals(); } while(0) -#define local_irq_restore(flags) do { set_signals(flags); } while(0) +#define local_save_flags(flags) do { typecheck(unsigned long, flags); \ + (flags) = get_signals(); } while(0) +#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \ + set_signals(flags); } while(0) #define local_irq_save(flags) do { local_save_flags(flags); \ local_irq_disable(); } while(0) @@ -38,5 +40,8 @@ local_save_flags(flags); \ (flags == 0); \ }) + +extern void *_switch_to(void *prev, void *next, void *last); +#define switch_to(prev, next, last) prev = _switch_to(prev, next, last) #endif
diff -Nru a/include/asm-um/thread_info.h b/include/asm-um/thread_info.h --- a/include/asm-um/thread_info.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-um/thread_info.h Fri Oct 31 14:10:53 2003 @@ -9,6 +9,7 @@ #ifndef __ASSEMBLY__ #include <asm/processor.h> +#include <asm/types.h> struct thread_info { struct task_struct *task; /* main task structure */ @@ -20,6 +21,7 @@ mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user 0-0xFFFFFFFF for kernel */ + struct restart_block restart_block; }; @@ -43,15 +45,18 @@ static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~16383UL)); + unsigned long mask = PAGE_SIZE * + (1 << CONFIG_KERNEL_STACK_ORDER) - 1; + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~mask)); return ti; } /* thread information allocation */ -#define THREAD_SIZE (4*PAGE_SIZE) -#define alloc_thread_info(tsk) ((struct thread_info *) \ - __get_free_pages(GFP_KERNEL,2)) -#define free_thread_info(ti) free_pages((unsigned long) (ti), 2) +#define THREAD_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE) +#define alloc_thread_info(tsk) \ + ((struct thread_info *) kmalloc(THREAD_SIZE, GFP_KERNEL)) +#define free_thread_info(ti) kfree(ti) + #define get_thread_info(ti) get_task_struct((ti)->task) #define put_thread_info(ti) put_task_struct((ti)->task) @@ -65,11 +70,13 @@ #define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling * TIF_NEED_RESCHED */ +#define TIF_RESTART_BLOCK 4 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) +#define _TIF_RESTART_BLOCK (1 << TIF_RESTART_BLOCK) #endif
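The current_thread_info() rewrite above exists because the old ~16383UL mask hardwired a 16K stack, i.e. order 2 with 4K pages; once CONFIG_KERNEL_STACK_ORDER is tunable, the mask has to be derived from it, and THREAD_SIZE and alloc_thread_info() must follow. A quick standalone table of how the mask tracks the order:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	int order;

	for (order = 0; order <= 3; order++) {
		unsigned long mask = PAGE_SIZE * (1UL << order) - 1;

		/* order 2 reproduces the old hardcoded 16383 */
		printf("order %d: THREAD_SIZE %6lu, mask 0x%lx\n",
		       order, PAGE_SIZE << order, mask);
	}
	return 0;
}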
diff -Nru a/include/asm-um/timex.h b/include/asm-um/timex.h --- a/include/asm-um/timex.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-um/timex.h Fri Oct 31 14:10:54 2003 @@ -1,8 +1,6 @@ #ifndef __UM_TIMEX_H #define __UM_TIMEX_H -#include "linux/time.h" - typedef unsigned long cycles_t; #define cacheflush_time (0)
diff -Nru a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h --- a/include/asm-x86_64/hw_irq.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-x86_64/hw_irq.h Fri Oct 31 14:10:54 2003 @@ -76,8 +76,8 @@ #ifndef __ASSEMBLY__ -extern int irq_vector[NR_IRQS]; -#define IO_APIC_VECTOR(irq) irq_vector[irq] +extern u8 irq_vector[NR_IRQ_VECTORS]; +#define IO_APIC_VECTOR(irq) ((int)irq_vector[irq]) /* * Various low-level irq details needed by irq.c, process.c,
diff -Nru a/include/asm-x86_64/irq.h b/include/asm-x86_64/irq.h --- a/include/asm-x86_64/irq.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-x86_64/irq.h Fri Oct 31 14:10:53 2003 @@ -22,6 +22,7 @@ * the usable vector space is 0x20-0xff (224 vectors) */ #define NR_IRQS 224 +#define NR_IRQ_VECTORS NR_IRQS static __inline__ int irq_canonicalize(int irq) {
diff -Nru a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h --- a/include/asm-x86_64/pci.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-x86_64/pci.h Fri Oct 31 14:10:53 2003 @@ -24,6 +24,8 @@ #define PCIBIOS_MIN_IO 0x1000 #define PCIBIOS_MIN_MEM (pci_mem_start) +#define PCIBIOS_MIN_CARDBUS_IO 0x4000 + void pcibios_config_init(void); struct pci_bus * pcibios_scan_root(int bus); extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int len, u32 *value);
diff -Nru a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h --- a/include/asm-x86_64/processor.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-x86_64/processor.h Fri Oct 31 14:10:54 2003 @@ -263,8 +263,8 @@ #define DOUBLEFAULT_STACK 2 #define NMI_STACK 3 #define N_EXCEPTION_STACKS 3 /* hw limit: 7 */ -#define EXCEPTION_STKSZ 1024 -#define EXCEPTION_STK_ORDER 0 +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) +#define EXCEPTION_STACK_ORDER 0 #define start_thread(regs,new_rip,new_rsp) do { \ asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \
diff -Nru a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h --- a/include/asm-x86_64/smp.h Fri Oct 31 14:10:53 2003 +++ b/include/asm-x86_64/smp.h Fri Oct 31 14:10:53 2003 @@ -74,15 +74,7 @@ return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); } -extern int slow_smp_processor_id(void); - -extern inline int safe_smp_processor_id(void) -{ - if (disable_apic) - return slow_smp_processor_id(); - else - return hard_smp_processor_id(); -} +#define safe_smp_processor_id() (cpuid_ebx(1) >> 24) #define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) #endif /* !ASSEMBLY */
diff -Nru a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h --- a/include/asm-x86_64/topology.h Fri Oct 31 14:10:54 2003 +++ b/include/asm-x86_64/topology.h Fri Oct 31 14:10:54 2003 @@ -10,13 +10,15 @@ /* Map the K8 CPU local memory controllers to a simple 1:1 CPU:NODE topology */ extern int fake_node; +/* This is actually a cpumask_t, but doesn't matter because we don't have + >BITS_PER_LONG CPUs */ extern unsigned long cpu_online_map; #define cpu_to_node(cpu) (fake_node ? 0 : (cpu)) #define memblk_to_node(memblk) (fake_node ? 0 : (memblk)) #define parent_node(node) (node) #define node_to_first_cpu(node) (fake_node ? 0 : (node)) -#define node_to_cpu_mask(node) (fake_node ? cpu_online_map : (1UL << (node))) +#define node_to_cpumask(node) (fake_node ? cpu_online_map : (1UL << (node))) #define node_to_memblk(node) (node) static inline unsigned long pcibus_to_cpumask(int bus)
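The rewritten safe_smp_processor_id() above reads the initial APIC ID from CPUID leaf 1, EBX bits 31-24; that needs no per-CPU setup or locking, so it stays usable from NMI and crash paths even with the APIC disabled. A user-space sketch of the same read, assuming x86 and GCC inline asm:

#include <stdio.h>

static unsigned int cpuid_ebx_demo(unsigned int op)
{
	unsigned int eax, ebx, ecx, edx;

	__asm__("cpuid"
		: "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
		: "0" (op));
	return ebx;
}

int main(void)
{
	/* mirrors safe_smp_processor_id(): initial APIC ID in EBX[31:24] */
	printf("initial APIC ID: %u\n", cpuid_ebx_demo(1) >> 24);
	return 0;
}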
14:10:54 2003 @@ -10,13 +10,15 @@ /* Map the K8 CPU local memory controllers to a simple 1:1 CPU:NODE topology */ extern int fake_node; +/* This is actually a cpumask_t, but doesn't matter because we don't have + >BITS_PER_LONG CPUs */ extern unsigned long cpu_online_map; #define cpu_to_node(cpu) (fake_node ? 0 : (cpu)) #define memblk_to_node(memblk) (fake_node ? 0 : (memblk)) #define parent_node(node) (node) #define node_to_first_cpu(node) (fake_node ? 0 : (node)) -#define node_to_cpu_mask(node) (fake_node ? cpu_online_map : (1UL << (node))) +#define node_to_cpumask(node) (fake_node ? cpu_online_map : (1UL << (node))) #define node_to_memblk(node) (node) static inline unsigned long pcibus_to_cpumask(int bus) diff -Nru a/include/linux/fs.h b/include/linux/fs.h --- a/include/linux/fs.h Fri Oct 31 14:10:53 2003 +++ b/include/linux/fs.h Fri Oct 31 14:10:53 2003 @@ -871,6 +871,7 @@ void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); + void (*sync_inodes) (struct super_block *, struct writeback_control * wbc); int (*show_options)(struct seq_file *, struct vfsmount *); }; @@ -1247,6 +1248,7 @@ extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); extern void generic_delete_inode(struct inode *inode); +extern void generic_forget_inode(struct inode *inode); extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); diff -Nru a/include/linux/fs.h~export-generic_forget_inode.diff b/include/linux/fs.h~export-generic_forget_inode.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/linux/fs.h~export-generic_forget_inode.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,1413 @@ +#ifndef _LINUX_FS_H +#define _LINUX_FS_H + +/* + * This file has definitions for some important file table + * structures etc. + */ + +#include <linux/config.h> +#include <linux/linkage.h> +#include <linux/limits.h> +#include <linux/wait.h> +#include <linux/types.h> +#include <linux/kdev_t.h> +#include <linux/ioctl.h> +#include <linux/list.h> +#include <linux/dcache.h> +#include <linux/stat.h> +#include <linux/cache.h> +#include <linux/radix-tree.h> +#include <linux/kobject.h> +#include <asm/atomic.h> + +struct iovec; +struct nameidata; +struct pipe_inode_info; +struct poll_table_struct; +struct kstatfs; +struct vm_area_struct; +struct vfsmount; + +/* + * It's silly to have NR_OPEN bigger than NR_FILE, but you can change + * the file limit at runtime and only root can increase the per-process + * nr_file rlimit, so it's safe to set up a ridiculously high absolute + * upper limit on files-per-process. + * + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. 
+ */ + +/* Fixed constants first: */ +#undef NR_OPEN +#define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */ +#define INR_OPEN 1024 /* Initial setting for nfile rlimits */ + +#define BLOCK_SIZE_BITS 10 +#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS) + +/* And dynamically-tunable limits and defaults: */ +struct files_stat_struct { + int nr_files; /* read only */ + int nr_free_files; /* read only */ + int max_files; /* tunable */ +}; +extern struct files_stat_struct files_stat; + +struct inodes_stat_t { + int nr_inodes; + int nr_unused; + int dummy[5]; +}; +extern struct inodes_stat_t inodes_stat; + +extern int leases_enable, dir_notify_enable, lease_break_time; + +#define NR_FILE 8192 /* this can well be larger on a larger system */ +#define NR_RESERVED_FILES 10 /* reserved for root */ +#define NR_SUPER 256 + +#define MAY_EXEC 1 +#define MAY_WRITE 2 +#define MAY_READ 4 +#define MAY_APPEND 8 + +#define FMODE_READ 1 +#define FMODE_WRITE 2 + +#define RW_MASK 1 +#define RWA_MASK 2 +#define READ 0 +#define WRITE 1 +#define READA 2 /* read-ahead - don't block if no resources */ +#define SPECIAL 4 /* For non-blockdevice requests in request queue */ + +#define SEL_IN 1 +#define SEL_OUT 2 +#define SEL_EX 4 + +/* public flags for file_system_type */ +#define FS_REQUIRES_DEV 1 +#define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */ +#define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon + * as nfs_rename() will be cleaned up + */ +/* + * These are the fs-independent mount-flags: up to 32 flags are supported + */ +#define MS_RDONLY 1 /* Mount read-only */ +#define MS_NOSUID 2 /* Ignore suid and sgid bits */ +#define MS_NODEV 4 /* Disallow access to device special files */ +#define MS_NOEXEC 8 /* Disallow program execution */ +#define MS_SYNCHRONOUS 16 /* Writes are synced at once */ +#define MS_REMOUNT 32 /* Alter flags of a mounted FS */ +#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ +#define MS_DIRSYNC 128 /* Directory modifications are synchronous */ +#define MS_NOATIME 1024 /* Do not update access times. */ +#define MS_NODIRATIME 2048 /* Do not update directory access times */ +#define MS_BIND 4096 +#define MS_MOVE 8192 +#define MS_REC 16384 +#define MS_VERBOSE 32768 +#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ +#define MS_ONE_SECOND (1<<17) /* fs has 1 sec a/m/ctime resolution */ +#define MS_ACTIVE (1<<30) +#define MS_NOUSER (1<<31) + +/* + * Superblock flags that can be altered by MS_REMOUNT + */ +#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_NOATIME|\ + MS_NODIRATIME) + +/* + * Old magic mount flag and mask + */ +#define MS_MGC_VAL 0xC0ED0000 +#define MS_MGC_MSK 0xffff0000 + +/* Inode flags - they have nothing to superblock flags now */ + +#define S_SYNC 1 /* Writes are synced at once */ +#define S_NOATIME 2 /* Do not update access times */ +#define S_QUOTA 4 /* Quota initialized for file */ +#define S_APPEND 8 /* Append-only file */ +#define S_IMMUTABLE 16 /* Immutable file */ +#define S_DEAD 32 /* removed, but still open directory */ +#define S_NOQUOTA 64 /* Inode is not counted to quota */ +#define S_DIRSYNC 128 /* Directory modifications are synchronous */ + +/* + * Note that nosuid etc flags are inode-specific: setting some file-system + * flags just means all the inodes inherit those flags by default. It might be + * possible to override it selectively if you really wanted to with some + * ioctl() that is not currently implemented. + * + * Exception: MS_RDONLY is always applied to the entire file system. 
+ * + * Unfortunately, it is possible to change a filesystems flags with it mounted + * with files in use. This means that all of the inodes will not have their + * i_flags updated. Hence, i_flags no longer inherit the superblock mount + * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org + */ +#define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg)) + +#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) +#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ + ((inode)->i_flags & S_SYNC)) +#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ + ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) +#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) + +#define IS_QUOTAINIT(inode) ((inode)->i_flags & S_QUOTA) +#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) +#define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) +#define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) +#define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME)) +#define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) +#define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) +#define IS_ONE_SECOND(inode) __IS_FLG(inode, MS_ONE_SECOND) + +#define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) + +/* the read-only stuff doesn't really belong here, but any other place is + probably as bad and I don't want to create yet another include file. */ + +#define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */ +#define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */ +#define BLKRRPART _IO(0x12,95) /* re-read partition table */ +#define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ +#define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ +#define BLKRASET _IO(0x12,98) /* set read ahead for block device */ +#define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ +#define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ +#define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ +#define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ +#define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ +#define BLKSSZGET _IO(0x12,104)/* get block device sector size */ +#if 0 +#define BLKPG _IO(0x12,105)/* See blkpg.h */ + +/* Some people are morons. Do not use sizeof! */ + +#define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */ +#define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */ +/* This was here just to show that the number is taken - + probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */ +#endif +/* A jump here: 108-111 have been used for various private purposes. 
*/ +#define BLKBSZGET _IOR(0x12,112,size_t) +#define BLKBSZSET _IOW(0x12,113,size_t) +#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ + +#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ +#define FIBMAP _IO(0x00,1) /* bmap access */ +#define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ + +#ifdef __KERNEL__ + +#include <asm/semaphore.h> +#include <asm/byteorder.h> + +/* Used to be a macro which just called the function, now just a function */ +extern void update_atime (struct inode *); + +extern void inode_init(unsigned long); +extern void mnt_init(unsigned long); +extern void files_init(unsigned long); + +struct buffer_head; +typedef int (get_block_t)(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +typedef int (get_blocks_t)(struct inode *inode, sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, int create); +typedef void (dio_iodone_t)(struct inode *inode, loff_t offset, + ssize_t bytes, void *private); + +/* + * Attribute flags. These should be or-ed together to figure out what + * has been changed! + */ +#define ATTR_MODE 1 +#define ATTR_UID 2 +#define ATTR_GID 4 +#define ATTR_SIZE 8 +#define ATTR_ATIME 16 +#define ATTR_MTIME 32 +#define ATTR_CTIME 64 +#define ATTR_ATIME_SET 128 +#define ATTR_MTIME_SET 256 +#define ATTR_FORCE 512 /* Not a change, but a change it */ +#define ATTR_ATTR_FLAG 1024 +#define ATTR_KILL_SUID 2048 +#define ATTR_KILL_SGID 4096 + +/* + * This is the Inode Attributes structure, used for notify_change(). It + * uses the above definitions as flags, to know which values have changed. + * Also, in this manner, a Filesystem can look at only the values it cares + * about. Basically, these are the attributes that the VFS layer can + * request to change from the FS layer. + * + * Derek Atkins <warlord@MIT.EDU> 94-10-20 + */ +struct iattr { + unsigned int ia_valid; + umode_t ia_mode; + uid_t ia_uid; + gid_t ia_gid; + loff_t ia_size; + struct timespec ia_atime; + struct timespec ia_mtime; + struct timespec ia_ctime; + unsigned int ia_attr_flags; +}; + +/* + * This is the inode attributes flag definitions + */ +#define ATTR_FLAG_SYNCRONOUS 1 /* Syncronous write */ +#define ATTR_FLAG_NOATIME 2 /* Don't update atime */ +#define ATTR_FLAG_APPEND 4 /* Append-only file */ +#define ATTR_FLAG_IMMUTABLE 8 /* Immutable file */ +#define ATTR_FLAG_NODIRATIME 16 /* Don't update atime for directory */ + +/* + * Includes for diskquotas. + */ +#include <linux/quota.h> + +/* + * oh the beauties of C type declarations. + */ +struct page; +struct address_space; +struct writeback_control; +struct kiocb; + +struct address_space_operations { + int (*writepage)(struct page *page, struct writeback_control *wbc); + int (*readpage)(struct file *, struct page *); + int (*sync_page)(struct page *); + + /* Write back some dirty pages from this mapping. */ + int (*writepages)(struct address_space *, struct writeback_control *); + + /* Set a page dirty */ + int (*set_page_dirty)(struct page *page); + + int (*readpages)(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + + /* + * ext3 requires that a successful prepare_write() call be followed + * by a commit_write() call - they must be balanced + */ + int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); + int (*commit_write)(struct file *, struct page *, unsigned, unsigned); + /* Unfortunately this kludge is needed for FIBMAP. 
Don't use it */ + sector_t (*bmap)(struct address_space *, sector_t); + int (*invalidatepage) (struct page *, unsigned long); + int (*releasepage) (struct page *, int); + int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, + loff_t offset, unsigned long nr_segs); +}; + +struct backing_dev_info; +struct address_space { + struct inode *host; /* owner: inode, block_device */ + struct radix_tree_root page_tree; /* radix tree of all pages */ + spinlock_t page_lock; /* and spinlock protecting it */ + struct list_head clean_pages; /* list of clean pages */ + struct list_head dirty_pages; /* list of dirty pages */ + struct list_head locked_pages; /* list of locked pages */ + struct list_head io_pages; /* being prepared for I/O */ + unsigned long nrpages; /* number of total pages */ + struct address_space_operations *a_ops; /* methods */ + struct list_head i_mmap; /* list of private mappings */ + struct list_head i_mmap_shared; /* list of shared mappings */ + struct semaphore i_shared_sem; /* protect both above lists */ + atomic_t truncate_count; /* Cover race condition with truncate */ + unsigned long dirtied_when; /* jiffies of first page dirtying */ + unsigned long flags; /* error bits/gfp mask */ + struct backing_dev_info *backing_dev_info; /* device readahead, etc */ + spinlock_t private_lock; /* for use by the address_space */ + struct list_head private_list; /* ditto */ + struct address_space *assoc_mapping; /* ditto */ +}; + +struct block_device { + dev_t bd_dev; /* not a kdev_t - it's a search key */ + struct inode * bd_inode; /* will die */ + int bd_openers; + struct semaphore bd_sem; /* open/close mutex */ + struct list_head bd_inodes; + void * bd_holder; + int bd_holders; + struct block_device * bd_contains; + unsigned bd_block_size; + struct hd_struct * bd_part; + unsigned bd_part_count; + int bd_invalidated; + struct gendisk * bd_disk; + struct list_head bd_list; +}; + +/* + * Use sequence counter to get consistent i_size on 32-bit processors. 
+ */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#include <linux/seqlock.h> +#define __NEED_I_SIZE_ORDERED +#define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) +#else +#define i_size_ordered_init(inode) do { } while (0) +#endif + +struct inode { + struct hlist_node i_hash; + struct list_head i_list; + struct list_head i_dentry; + unsigned long i_ino; + atomic_t i_count; + umode_t i_mode; + unsigned int i_nlink; + uid_t i_uid; + gid_t i_gid; + dev_t i_rdev; + loff_t i_size; + struct timespec i_atime; + struct timespec i_mtime; + struct timespec i_ctime; + unsigned int i_blkbits; + unsigned long i_blksize; + unsigned long i_version; + unsigned long i_blocks; + unsigned short i_bytes; + spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ + struct semaphore i_sem; + struct inode_operations *i_op; + struct file_operations *i_fop; /* former ->i_op->default_file_ops */ + struct super_block *i_sb; + struct file_lock *i_flock; + struct address_space *i_mapping; + struct address_space i_data; + struct dquot *i_dquot[MAXQUOTAS]; + /* These three should probably be a union */ + struct list_head i_devices; + struct pipe_inode_info *i_pipe; + struct block_device *i_bdev; + struct cdev *i_cdev; + int i_cindex; + + unsigned long i_dnotify_mask; /* Directory notify events */ + struct dnotify_struct *i_dnotify; /* for directory notifications */ + + unsigned long i_state; + + unsigned int i_flags; + unsigned char i_sock; + + atomic_t i_writecount; + void *i_security; + __u32 i_generation; + union { + void *generic_ip; + } u; +#ifdef __NEED_I_SIZE_ORDERED + seqcount_t i_size_seqcount; +#endif +}; + +/* + * NOTE: in a 32bit arch with a preemptable kernel and + * an UP compile the i_size_read/write must be atomic + * with respect to the local cpu (unlike with preempt disabled), + * but they don't need to be atomic with respect to other cpus like in + * true SMP (so they need either to either locally disable irq around + * the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles + * and 64bit archs it makes no difference if preempt is enabled or not. 
+ */ +static inline loff_t i_size_read(struct inode *inode) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + loff_t i_size; + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + i_size = inode->i_size; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); + return i_size; +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) + loff_t i_size; + + preempt_disable(); + i_size = inode->i_size; + preempt_enable(); + return i_size; +#else + return inode->i_size; +#endif +} + + +static inline void i_size_write(struct inode *inode, loff_t i_size) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + inode->i_size = i_size; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) + preempt_disable(); + inode->i_size = i_size; + preempt_enable(); +#else + inode->i_size = i_size; +#endif +} + +static inline unsigned iminor(struct inode *inode) +{ + return MINOR(inode->i_rdev); +} + +static inline unsigned imajor(struct inode *inode) +{ + return MAJOR(inode->i_rdev); +} + +struct fown_struct { + rwlock_t lock; /* protects pid, uid, euid fields */ + int pid; /* pid or -pgrp where SIGIO should be sent */ + uid_t uid, euid; /* uid/euid of process setting the owner */ + int signum; /* posix.1b rt signal to be delivered on IO */ + void *security; +}; + +/* + * Track a single file's readahead state + */ +struct file_ra_state { + unsigned long start; /* Current window */ + unsigned long size; + unsigned long next_size; /* Next window size */ + unsigned long prev_page; /* Cache last read() position */ + unsigned long ahead_start; /* Ahead window */ + unsigned long ahead_size; + unsigned long ra_pages; /* Maximum readahead window */ + unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ + unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ +}; + +struct file { + struct list_head f_list; + struct dentry *f_dentry; + struct vfsmount *f_vfsmnt; + struct file_operations *f_op; + atomic_t f_count; + unsigned int f_flags; + mode_t f_mode; + loff_t f_pos; + struct fown_struct f_owner; + unsigned int f_uid, f_gid; + int f_error; + struct file_ra_state f_ra; + + unsigned long f_version; + void *f_security; + + /* needed for tty driver, and maybe others */ + void *private_data; + + /* Used by fs/eventpoll.c to link all the hooks to this file */ + struct list_head f_ep_links; + spinlock_t f_ep_lock; +}; +extern spinlock_t files_lock; +#define file_list_lock() spin_lock(&files_lock); +#define file_list_unlock() spin_unlock(&files_lock); + +#define get_file(x) atomic_inc(&(x)->f_count) +#define file_count(x) atomic_read(&(x)->f_count) + +/* Initialize and open a private file and allocate its security structure. */ +extern int open_private_file(struct file *, struct dentry *, int); +/* Release a private file and free its security structure. */ +extern void close_private_file(struct file *file); + +#define MAX_NON_LFS ((1UL<<31) - 1) + +/* Page cache limit. The filesystems should put that into their s_maxbytes + limits, otherwise bad things can happen in VM. 
*/ +#if BITS_PER_LONG==32 +#define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) +#elif BITS_PER_LONG==64 +#define MAX_LFS_FILESIZE 0x7fffffffffffffff +#endif + +#define FL_POSIX 1 +#define FL_FLOCK 2 +#define FL_ACCESS 8 /* not trying to lock, just looking */ +#define FL_LOCKD 16 /* lock held by rpc.lockd */ +#define FL_LEASE 32 /* lease held on this file */ +#define FL_SLEEP 128 /* A blocking lock */ + +/* + * The POSIX file lock owner is determined by + * the "struct files_struct" in the thread group + * (or NULL for no owner - BSD locks). + * + * Lockd stuffs a "host" pointer into this. + */ +typedef struct files_struct *fl_owner_t; + +/* that will die - we need it for nfs_lock_info */ +#include <linux/nfs_fs_i.h> + +struct file_lock { + struct file_lock *fl_next; /* singly linked list for this inode */ + struct list_head fl_link; /* doubly linked list of all locks */ + struct list_head fl_block; /* circular list of blocked processes */ + fl_owner_t fl_owner; + unsigned int fl_pid; + wait_queue_head_t fl_wait; + struct file *fl_file; + unsigned char fl_flags; + unsigned char fl_type; + loff_t fl_start; + loff_t fl_end; + + void (*fl_notify)(struct file_lock *); /* unblock callback */ + void (*fl_insert)(struct file_lock *); /* lock insertion callback */ + void (*fl_remove)(struct file_lock *); /* lock removal callback */ + + struct fasync_struct * fl_fasync; /* for lease break notifications */ + unsigned long fl_break_time; /* for nonblocking lease breaks */ + + union { + struct nfs_lock_info nfs_fl; + } fl_u; +}; + +/* The following constant reflects the upper bound of the file/locking space */ +#ifndef OFFSET_MAX +#define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) +#define OFFSET_MAX INT_LIMIT(loff_t) +#define OFFT_OFFSET_MAX INT_LIMIT(off_t) +#endif + +extern struct list_head file_lock_list; + +#include <linux/fcntl.h> + +extern int fcntl_getlk(struct file *, struct flock __user *); +extern int fcntl_setlk(struct file *, unsigned int, struct flock __user *); + +#if BITS_PER_LONG == 32 +extern int fcntl_getlk64(struct file *, struct flock64 __user *); +extern int fcntl_setlk64(struct file *, unsigned int, struct flock64 __user *); +#endif + +extern void send_sigio(struct fown_struct *fown, int fd, int band); +extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); +extern int fcntl_getlease(struct file *filp); + +/* fs/locks.c */ +extern void locks_init_lock(struct file_lock *); +extern void locks_copy_lock(struct file_lock *, struct file_lock *); +extern void locks_remove_posix(struct file *, fl_owner_t); +extern void locks_remove_flock(struct file *); +extern struct file_lock *posix_test_lock(struct file *, struct file_lock *); +extern int posix_lock_file(struct file *, struct file_lock *); +extern void posix_block_lock(struct file_lock *, struct file_lock *); +extern void posix_unblock_lock(struct file *, struct file_lock *); +extern int posix_locks_deadlock(struct file_lock *, struct file_lock *); +extern int __break_lease(struct inode *inode, unsigned int flags); +extern void lease_get_mtime(struct inode *, struct timespec *time); +extern int lock_may_read(struct inode *, loff_t start, unsigned long count); +extern int lock_may_write(struct inode *, loff_t start, unsigned long count); + +struct fasync_struct { + int magic; + int fa_fd; + struct fasync_struct *fa_next; /* singly linked list */ + struct file *fa_file; +}; + +#define FASYNC_MAGIC 0x4601 + +/* SMP safe fasync helpers: */ +extern int fasync_helper(int, struct file *, int, struct 
fasync_struct **); +/* can be called from interrupts */ +extern void kill_fasync(struct fasync_struct **, int, int); +/* only for net: no internal synchronization */ +extern void __kill_fasync(struct fasync_struct *, int, int); + +extern int f_setown(struct file *filp, unsigned long arg, int force); +extern void f_delown(struct file *filp); +extern int send_sigurg(struct fown_struct *fown); + +/* + * Umount options + */ + +#define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ +#define MNT_DETACH 0x00000002 /* Just detach from the tree */ + +extern struct list_head super_blocks; +extern spinlock_t sb_lock; + +#define sb_entry(list) list_entry((list), struct super_block, s_list) +#define S_BIAS (1<<30) +struct super_block { + struct list_head s_list; /* Keep this first */ + dev_t s_dev; /* search index; _not_ kdev_t */ + unsigned long s_blocksize; + unsigned long s_old_blocksize; + unsigned char s_blocksize_bits; + unsigned char s_dirt; + unsigned long long s_maxbytes; /* Max file size */ + struct file_system_type *s_type; + struct super_operations *s_op; + struct dquot_operations *dq_op; + struct quotactl_ops *s_qcop; + struct export_operations *s_export_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; + struct rw_semaphore s_umount; + struct semaphore s_lock; + int s_count; + int s_syncing; + int s_need_sync_fs; + atomic_t s_active; + void *s_security; + + struct list_head s_dirty; /* dirty inodes */ + struct list_head s_io; /* parked for writeback */ + struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ + struct list_head s_files; + + struct block_device *s_bdev; + struct list_head s_instances; + struct quota_info s_dquot; /* Diskquota specific options */ + + char s_id[32]; /* Informational name */ + + struct kobject kobj; /* anchor for sysfs */ + void *s_fs_info; /* Filesystem private info */ + + /* + * The next field is for VFS *only*. No filesystems have any business + * even looking at it. You had been warned. + */ + struct semaphore s_vfs_rename_sem; /* Kludge */ +}; + +/* + * Superblock locking. + */ +static inline void lock_super(struct super_block * sb) +{ + down(&sb->s_lock); +} + +static inline void unlock_super(struct super_block * sb) +{ + up(&sb->s_lock); +} + +/* + * VFS helper functions.. + */ +extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); +extern int vfs_mkdir(struct inode *, struct dentry *, int); +extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); +extern int vfs_symlink(struct inode *, struct dentry *, const char *); +extern int vfs_link(struct dentry *, struct inode *, struct dentry *); +extern int vfs_rmdir(struct inode *, struct dentry *); +extern int vfs_unlink(struct inode *, struct dentry *); +extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); + +/* + * File types + * + * NOTE! These match bits 12..15 of stat.st_mode + * (ie "(i_mode >> 12) & 15"). + */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 + +/* + * This is the "filldir" function type, used by readdir() to let + * the kernel specify what kind of dirent layout it wants to have. + * This allows the kernel to read directories into kernel space or + * to have different dirent layouts depending on the binary type. 
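Since readdir() is driven entirely through this callback type, a concrete callback makes the contract easier to see. A hedged sketch of a filldir_t implementation that merely counts entries (struct count_ctx and the choice of -EINVAL as the stop value are illustrative, not mandated by this header):

#include <linux/errno.h>

struct count_ctx {
	unsigned int seen;
	unsigned int limit;
};

/* Matches the filldir_t typedef below: (opaque buffer, entry name,
 * name length, offset of the next entry, inode number, DT_* type). */
static int count_filldir(void *buf, const char *name, int namlen,
			 loff_t offset, ino_t ino, unsigned d_type)
{
	struct count_ctx *ctx = buf;

	if (ctx->seen >= ctx->limit)
		return -EINVAL;		/* negative return asks the fs to stop */
	ctx->seen++;
	return 0;
}

Such a callback would be handed to vfs_readdir(), declared later in this header, with a pointer to the context as the opaque argument; the d_type parameter carries one of the DT_* values listed above, which match (i_mode >> 12) & 15 when the filesystem knows the type.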
+ */ +typedef int (*filldir_t)(void *, const char *, int, loff_t, ino_t, unsigned); + +struct block_device_operations { + int (*open) (struct inode *, struct file *); + int (*release) (struct inode *, struct file *); + int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); + int (*media_changed) (struct gendisk *); + int (*revalidate_disk) (struct gendisk *); + struct module *owner; +}; + +/* + * "descriptor" for what we're up to with a read for sendfile(). + * This allows us to use the same read code yet + * have multiple different users of the data that + * we read from a file. + * + * The simplest case just copies the data to user + * mode. + */ +typedef struct { + size_t written; + size_t count; + char __user * buf; + int error; +} read_descriptor_t; + +typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); + +/* + * NOTE: + * read, write, poll, fsync, readv, writev can be called + * without the big kernel lock held in all filesystems. + */ +struct file_operations { + struct module *owner; + loff_t (*llseek) (struct file *, loff_t, int); + ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); + ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t); + ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); + ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t, loff_t); + int (*readdir) (struct file *, void *, filldir_t); + unsigned int (*poll) (struct file *, struct poll_table_struct *); + int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); + int (*mmap) (struct file *, struct vm_area_struct *); + int (*open) (struct inode *, struct file *); + int (*flush) (struct file *); + int (*release) (struct inode *, struct file *); + int (*fsync) (struct file *, struct dentry *, int datasync); + int (*aio_fsync) (struct kiocb *, int datasync); + int (*fasync) (int, struct file *, int); + int (*lock) (struct file *, int, struct file_lock *); + ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); + ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); + ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void __user *); + ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +}; + +struct inode_operations { + int (*create) (struct inode *,struct dentry *,int, struct nameidata *); + struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); + int (*link) (struct dentry *,struct inode *,struct dentry *); + int (*unlink) (struct inode *,struct dentry *); + int (*symlink) (struct inode *,struct dentry *,const char *); + int (*mkdir) (struct inode *,struct dentry *,int); + int (*rmdir) (struct inode *,struct dentry *); + int (*mknod) (struct inode *,struct dentry *,int,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); + int (*readlink) (struct dentry *, char __user *,int); + int (*follow_link) (struct dentry *, struct nameidata *); + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int, struct nameidata *); + int (*setattr) (struct dentry *, struct iattr *); + int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); + int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + 
ssize_t (*listxattr) (struct dentry *, char *, size_t);
+	int (*removexattr) (struct dentry *, const char *);
+};
+
+struct seq_file;
+
+extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
+extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
+extern ssize_t vfs_readv(struct file *, const struct iovec __user *,
+		unsigned long, loff_t *);
+extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
+		unsigned long, loff_t *);
+
+/*
+ * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called
+ * without the big kernel lock held in all filesystems.
+ */
+struct super_operations {
+	struct inode *(*alloc_inode)(struct super_block *sb);
+	void (*destroy_inode)(struct inode *);
+
+	void (*read_inode) (struct inode *);
+
+	void (*dirty_inode) (struct inode *);
+	void (*write_inode) (struct inode *, int);
+	void (*put_inode) (struct inode *);
+	void (*drop_inode) (struct inode *);
+	void (*delete_inode) (struct inode *);
+	void (*put_super) (struct super_block *);
+	void (*write_super) (struct super_block *);
+	int (*sync_fs)(struct super_block *sb, int wait);
+	void (*write_super_lockfs) (struct super_block *);
+	void (*unlockfs) (struct super_block *);
+	int (*statfs) (struct super_block *, struct kstatfs *);
+	int (*remount_fs) (struct super_block *, int *, char *);
+	void (*clear_inode) (struct inode *);
+	void (*umount_begin) (struct super_block *);
+
+	void (*sync_inodes) (struct super_block *, struct writeback_control *wbc);
+	int (*show_options)(struct seq_file *, struct vfsmount *);
+};
+
+/* Inode state bits.  Protected by inode_lock. */
+#define I_DIRTY_SYNC		1 /* Not dirty enough for O_DATASYNC */
+#define I_DIRTY_DATASYNC	2 /* Data-related inode changes pending */
+#define I_DIRTY_PAGES		4 /* Data-related inode changes pending */
+#define I_LOCK			8
+#define I_FREEING		16
+#define I_CLEAR			32
+#define I_NEW			64
+
+#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+
+extern void __mark_inode_dirty(struct inode *, int);
+static inline void mark_inode_dirty(struct inode *inode)
+{
+	__mark_inode_dirty(inode, I_DIRTY);
+}
+
+static inline void mark_inode_dirty_sync(struct inode *inode)
+{
+	__mark_inode_dirty(inode, I_DIRTY_SYNC);
+}
+
+
+/**
+ * &export_operations - for nfsd to communicate with file systems
+ * decode_fh:	decode a file handle fragment and return a &struct dentry
+ * encode_fh:	encode a file handle fragment from a dentry
+ * get_name:	find the name for a given inode in a given directory
+ * get_parent:	find the parent of a given directory
+ * get_dentry:	find a dentry for the inode given a file handle sub-fragment
+ *
+ * Description:
+ *    The export_operations structure provides a means for nfsd to communicate
+ *    with a particular exported file system - particularly enabling nfsd and
+ *    the filesystem to co-operate when dealing with file handles.
+ *
+ *    export_operations contains two basic operations for dealing with file
+ *    handles, decode_fh() and encode_fh(), and allows for some other
+ *    operations to be defined which standard helper routines use to get
+ *    specific information from the filesystem.
+ *
+ *    nfsd encodes information used to determine which filesystem a filehandle
+ *    applies to in the initial part of the file handle.  The remainder,
+ *    termed a file handle fragment, is controlled completely by the
+ *    filesystem.
+ *    The standard helper routines assume that this fragment will contain one
+ *    or two sub-fragments, one which identifies the file, and one which may
+ *    be used to identify a directory containing the file.
+ *
+ *    In some situations, nfsd needs to get a dentry which is connected into a
+ *    specific part of the file tree.  To allow for this, it passes the
+ *    function acceptable() together with a @context which can be used to see
+ *    if the dentry is acceptable.  As there can be multiple dentries for a
+ *    given file, the filesystem should check each one for acceptability
+ *    before looking for the next.  As soon as an acceptable one is found, it
+ *    should be returned.
+ *
+ * decode_fh:
+ *    @decode_fh is given a &struct super_block (@sb), a file handle fragment
+ *    (@fh, @fh_len) and an acceptability testing function (@acceptable,
+ *    @context).  It should return a &struct dentry which refers to the same
+ *    file that the file handle fragment refers to, and which passes the
+ *    acceptability test.  If it cannot, it should return a %NULL pointer if
+ *    the file was found but no acceptable &dentries were available, or an
+ *    %ERR_PTR error code indicating why it couldn't be found (e.g. %ENOENT
+ *    or %ENOMEM).
+ *
+ * encode_fh:
+ *    @encode_fh should store in the file handle fragment @fh (using at most
+ *    @max_len bytes) information that can be used by @decode_fh to recover
+ *    the file referred to by the &struct dentry @de.  If the @connectable
+ *    flag is set, encode_fh() should store sufficient information so that a
+ *    good attempt can be made to find not only the file but also its place
+ *    in the filesystem.  This typically means storing a reference to
+ *    de->d_parent in the filehandle fragment.  encode_fh() should return the
+ *    number of bytes stored or a negative error code such as %-ENOSPC.
+ *
+ * get_name:
+ *    @get_name should find a name for the given @child in the given @parent
+ *    directory.  The name should be stored in @name (with the understanding
+ *    that it already points to a %NAME_MAX+1 sized buffer).  get_name()
+ *    should return %0 on success, or a negative error code on error.
+ *    @get_name will be called without @parent->i_sem held.
+ *
+ * get_parent:
+ *    @get_parent should find the parent directory for the given @child which
+ *    is also a directory.  In the event that it cannot be found, or storage
+ *    space cannot be allocated, an %ERR_PTR should be returned.
+ *
+ * get_dentry:
+ *    Given a &super_block (@sb) and a pointer to a file-system specific inode
+ *    identifier, possibly an inode number, (@inump) get_dentry() should find
+ *    the identified inode and return a dentry for that inode.  Any suitable
+ *    dentry can be returned including, if necessary, a new dentry created
+ *    with d_alloc_root.  The caller can then find any other extant dentries
+ *    by following the d_alias links.  If a new dentry was created using
+ *    d_alloc_root, DCACHE_NFSD_DISCONNECTED should be set, and the dentry
+ *    should be d_rehash()ed.
+ *
+ *    If the inode cannot be found, either a %NULL pointer or an %ERR_PTR
+ *    code can be returned.  The @inump will be whatever was passed to
+ *    nfsd_find_fh_dentry() in either the @obj or @parent parameters.
+ * + * Locking rules: + * get_parent is called with child->d_inode->i_sem down + * get_name is not (which is possibly inconsistent) + */ + +struct export_operations { + struct dentry *(*decode_fh)(struct super_block *sb, __u32 *fh, int fh_len, int fh_type, + int (*acceptable)(void *context, struct dentry *de), + void *context); + int (*encode_fh)(struct dentry *de, __u32 *fh, int *max_len, + int connectable); + + /* the following are only called from the filesystem itself */ + int (*get_name)(struct dentry *parent, char *name, + struct dentry *child); + struct dentry * (*get_parent)(struct dentry *child); + struct dentry * (*get_dentry)(struct super_block *sb, void *inump); + + /* This is set by the exporting module to a standard helper */ + struct dentry * (*find_exported_dentry)( + struct super_block *sb, void *obj, void *parent, + int (*acceptable)(void *context, struct dentry *de), + void *context); + + +}; + + +struct file_system_type { + const char *name; + int fs_flags; + struct super_block *(*get_sb) (struct file_system_type *, int, + const char *, void *); + void (*kill_sb) (struct super_block *); + struct module *owner; + struct file_system_type * next; + struct list_head fs_supers; +}; + +struct super_block *get_sb_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int)); +struct super_block *get_sb_single(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)); +struct super_block *get_sb_nodev(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)); +void generic_shutdown_super(struct super_block *sb); +void kill_block_super(struct super_block *sb); +void kill_anon_super(struct super_block *sb); +void kill_litter_super(struct super_block *sb); +void deactivate_super(struct super_block *sb); +int set_anon_super(struct super_block *s, void *data); +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + void *data); +struct super_block *get_sb_pseudo(struct file_system_type *, char *, + struct super_operations *ops, unsigned long); + +/* Alas, no aliases. Too much hassle with bringing module.h everywhere */ +#define fops_get(fops) \ + (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) +#define fops_put(fops) \ + do { if (fops) module_put((fops)->owner); } while(0) + +extern int register_filesystem(struct file_system_type *); +extern int unregister_filesystem(struct file_system_type *); +extern struct vfsmount *kern_mount(struct file_system_type *); +extern int may_umount(struct vfsmount *); +extern long do_mount(char *, char *, char *, unsigned long, void *); + +extern int vfs_statfs(struct super_block *, struct kstatfs *); + +/* Return value for VFS lock functions - tells locks.c to lock conventionally + * REALLY kosha for root NFS and nfs_lock + */ +#define LOCK_USE_CLNT 1 + +#define FLOCK_VERIFY_READ 1 +#define FLOCK_VERIFY_WRITE 2 + +extern int locks_mandatory_locked(struct inode *); +extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); + +/* + * Candidates for mandatory locking have the setgid bit set + * but no group execute bit - an otherwise meaningless combination. 
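The setgid-without-group-execute convention described above reduces to a single bit test on the mode, which the MANDATORY_LOCK() macro just below combines with an IS_MANDLOCK() check that the filesystem was mounted with MS_MANDLOCK. A standalone sketch of the mode predicate (mode_wants_mandatory_lock() is an invented name for illustration):

#include <sys/stat.h>

/* Setgid set while group-execute is clear, e.g. "chmod 2644 file". */
static int mode_wants_mandatory_lock(mode_t mode)
{
	return (mode & (S_ISGID | S_IXGRP)) == S_ISGID;
}

So a 02644 file is a candidate for mandatory locking, while 02754 is not: its group-execute bit makes the setgid bit meaningful as a real permission again.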
+ */ +#define MANDATORY_LOCK(inode) \ + (IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + +static inline int locks_verify_locked(struct inode *inode) +{ + if (MANDATORY_LOCK(inode)) + return locks_mandatory_locked(inode); + return 0; +} + +static inline int locks_verify_area(int read_write, struct inode *inode, + struct file *filp, loff_t offset, + size_t count) +{ + if (inode->i_flock && MANDATORY_LOCK(inode)) + return locks_mandatory_area(read_write, inode, filp, offset, count); + return 0; +} + +static inline int locks_verify_truncate(struct inode *inode, + struct file *filp, + loff_t size) +{ + if (inode->i_flock && MANDATORY_LOCK(inode)) + return locks_mandatory_area( + FLOCK_VERIFY_WRITE, inode, filp, + size < inode->i_size ? size : inode->i_size, + (size < inode->i_size ? inode->i_size - size + : size - inode->i_size) + ); + return 0; +} + +static inline int break_lease(struct inode *inode, unsigned int mode) +{ + if (inode->i_flock) + return __break_lease(inode, mode); + return 0; +} + +/* fs/open.c */ + +asmlinkage long sys_open(const char __user *, int, int); +asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ +extern int do_truncate(struct dentry *, loff_t start); + +extern struct file *filp_open(const char *, int, int); +extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); +extern int filp_close(struct file *, fl_owner_t id); +extern char * getname(const char __user *); + +/* fs/dcache.c */ +extern void vfs_caches_init(unsigned long); + +#define __getname() kmem_cache_alloc(names_cachep, SLAB_KERNEL) +#define putname(name) kmem_cache_free(names_cachep, (void *)(name)) + +enum {BDEV_FILE, BDEV_SWAP, BDEV_FS, BDEV_RAW}; +extern int register_blkdev(unsigned int, const char *); +extern int unregister_blkdev(unsigned int, const char *); +extern struct block_device *bdget(dev_t); +extern int bd_acquire(struct inode *inode); +extern void bd_forget(struct inode *inode); +extern void bdput(struct block_device *); +extern int blkdev_open(struct inode *, struct file *); +extern int blkdev_close(struct inode *, struct file *); +extern struct block_device *open_by_devnum(dev_t, unsigned, int); +extern struct file_operations def_blk_fops; +extern struct address_space_operations def_blk_aops; +extern struct file_operations def_chr_fops; +extern struct file_operations bad_sock_fops; +extern struct file_operations def_fifo_fops; +extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); +extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long); +extern int blkdev_get(struct block_device *, mode_t, unsigned, int); +extern int blkdev_put(struct block_device *, int); +extern int bd_claim(struct block_device *, void *); +extern void bd_release(struct block_device *); +extern void blk_run_queues(void); + +/* fs/char_dev.c */ +extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, char *); +extern int register_chrdev_region(dev_t, unsigned, char *); +extern int register_chrdev(unsigned int, const char *, + struct file_operations *); +extern int unregister_chrdev(unsigned int, const char *); +extern void unregister_chrdev_region(dev_t, unsigned); +extern int chrdev_open(struct inode *, struct file *); + +/* fs/block_dev.c */ +#define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ +extern const char *__bdevname(dev_t, char *buffer); +extern const char *bdevname(struct block_device *bdev, char *buffer); +extern struct block_device *lookup_bdev(const char *); +extern struct 
block_device *open_bdev_excl(const char *, int, int, void *); +extern void close_bdev_excl(struct block_device *, int); + +extern void init_special_inode(struct inode *, umode_t, dev_t); + +/* Invalid inode operations -- fs/bad_inode.c */ +extern void make_bad_inode(struct inode *); +extern int is_bad_inode(struct inode *); + +extern struct file_operations read_fifo_fops; +extern struct file_operations write_fifo_fops; +extern struct file_operations rdwr_fifo_fops; +extern struct file_operations read_pipe_fops; +extern struct file_operations write_pipe_fops; +extern struct file_operations rdwr_pipe_fops; + +extern int fs_may_remount_ro(struct super_block *); + +/* + * return READ, READA, or WRITE + */ +#define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) + +/* + * return data direction, READ or WRITE + */ +#define bio_data_dir(bio) ((bio)->bi_rw & 1) + +extern int check_disk_change(struct block_device *); +extern int invalidate_inodes(struct super_block *); +extern int __invalidate_device(struct block_device *, int); +extern int invalidate_partition(struct gendisk *, int); +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end); +unsigned long invalidate_inode_pages(struct address_space *mapping); +static inline void invalidate_remote_inode(struct inode *inode) +{ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + invalidate_inode_pages(inode->i_mapping); +} +extern void invalidate_inode_pages2(struct address_space *mapping); +extern void write_inode_now(struct inode *, int); +extern int filemap_fdatawrite(struct address_space *); +extern int filemap_flush(struct address_space *); +extern int filemap_fdatawait(struct address_space *); +extern void sync_supers(void); +extern void sync_filesystems(int wait); +extern void emergency_sync(void); +extern void emergency_remount(void); +extern int do_remount_sb(struct super_block *sb, int flags, + void *data, int force); +extern sector_t bmap(struct inode *, sector_t); +extern int setattr_mask(unsigned int); +extern int notify_change(struct dentry *, struct iattr *); +extern int permission(struct inode *, int, struct nameidata *); +extern int vfs_permission(struct inode *, int); +extern int get_write_access(struct inode *); +extern int deny_write_access(struct file *); +static inline void put_write_access(struct inode * inode) +{ + atomic_dec(&inode->i_writecount); +} +static inline void allow_write_access(struct file *file) +{ + if (file) + atomic_inc(&file->f_dentry->d_inode->i_writecount); +} +extern int do_pipe(int *); + +extern int open_namei(const char *, int, int, struct nameidata *); +extern int may_open(struct nameidata *, int, int); + +extern int kernel_read(struct file *, unsigned long, char *, unsigned long); +extern struct file * open_exec(const char *); + +/* fs/dcache.c -- generic fs support functions */ +extern int is_subdir(struct dentry *, struct dentry *); +extern ino_t find_inode_number(struct dentry *, struct qstr *); + +#include <linux/err.h> + +/* needed for stackable file system support */ +extern loff_t default_llseek(struct file *file, loff_t offset, int origin); + +extern void inode_init_once(struct inode *); +extern void iput(struct inode *); +extern struct inode * igrab(struct inode *); +extern ino_t iunique(struct super_block *, ino_t); +extern int inode_needs_sync(struct inode *inode); +extern void generic_delete_inode(struct inode *inode); + +extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int 
(*test)(struct inode *, void *), void *data); +extern struct inode *ilookup(struct super_block *sb, unsigned long ino); + +extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); +extern struct inode * iget_locked(struct super_block *, unsigned long); +extern void unlock_new_inode(struct inode *); + +static inline struct inode *iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode = iget_locked(sb, ino); + + if (inode && (inode->i_state & I_NEW)) { + sb->s_op->read_inode(inode); + unlock_new_inode(inode); + } + + return inode; +} + +extern void __iget(struct inode * inode); +extern void clear_inode(struct inode *); +extern void destroy_inode(struct inode *); +extern struct inode *new_inode(struct super_block *); +extern void remove_suid(struct dentry *); + +extern void __insert_inode_hash(struct inode *, unsigned long hashval); +extern void remove_inode_hash(struct inode *); +static inline void insert_inode_hash(struct inode *inode) { + __insert_inode_hash(inode, inode->i_ino); +} + +extern struct file * get_empty_filp(void); +extern void file_move(struct file *f, struct list_head *list); +extern void file_kill(struct file *f); +struct bio; +extern int submit_bio(int, struct bio *); +extern int bdev_read_only(struct block_device *); +extern int set_blocksize(struct block_device *, int); +extern int sb_set_blocksize(struct super_block *, int); +extern int sb_min_blocksize(struct super_block *, int); + +extern int generic_file_mmap(struct file *, struct vm_area_struct *); +extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); +extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); +extern int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); +extern ssize_t generic_file_read(struct file *, char __user *, size_t, loff_t *); +int generic_write_checks(struct inode *inode, struct file *file, + loff_t *pos, size_t *count, int isblk); +extern ssize_t generic_file_write(struct file *, const char __user *, size_t, loff_t *); +extern ssize_t generic_file_aio_read(struct kiocb *, char __user *, size_t, loff_t); +extern ssize_t __generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t *); +extern ssize_t generic_file_aio_write(struct kiocb *, const char __user *, size_t, loff_t); +extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, + unsigned long, loff_t *); +extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); +extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); +ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos); +extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void __user *); +extern void do_generic_mapping_read(struct address_space *, struct file_ra_state *, struct file *, + loff_t *, read_descriptor_t *, read_actor_t); +extern void +file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); +extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs); +extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long 
nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io); +extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos); +ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos); +extern loff_t no_llseek(struct file *file, loff_t offset, int origin); +extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); +extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); +extern int generic_file_open(struct inode * inode, struct file * filp); + +static inline void do_generic_file_read(struct file * filp, loff_t *ppos, + read_descriptor_t * desc, + read_actor_t actor) +{ + do_generic_mapping_read(filp->f_dentry->d_inode->i_mapping, + &filp->f_ra, + filp, + ppos, + desc, + actor); +} + +extern struct file_operations generic_ro_fops; + +#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) + +extern int vfs_readlink(struct dentry *, char __user *, int, const char *); +extern int vfs_follow_link(struct nameidata *, const char *); +extern int page_readlink(struct dentry *, char __user *, int); +extern int page_follow_link(struct dentry *, struct nameidata *); +extern int page_symlink(struct inode *inode, const char *symname, int len); +extern struct inode_operations page_symlink_inode_operations; +extern void generic_fillattr(struct inode *, struct kstat *); +extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +void inode_add_bytes(struct inode *inode, loff_t bytes); +void inode_sub_bytes(struct inode *inode, loff_t bytes); +loff_t inode_get_bytes(struct inode *inode); +void inode_set_bytes(struct inode *inode, loff_t bytes); + +extern int vfs_readdir(struct file *, filldir_t, void *); + +extern int vfs_stat(char __user *, struct kstat *); +extern int vfs_lstat(char __user *, struct kstat *); +extern int vfs_fstat(unsigned int, struct kstat *); + +extern struct file_system_type *get_fs_type(const char *name); +extern struct super_block *get_super(struct block_device *); +extern struct super_block *user_get_super(dev_t); +extern void drop_super(struct super_block *sb); + +extern int dcache_dir_open(struct inode *, struct file *); +extern int dcache_dir_close(struct inode *, struct file *); +extern loff_t dcache_dir_lseek(struct file *, loff_t, int); +extern int dcache_readdir(struct file *, void *, filldir_t); +extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int simple_statfs(struct super_block *, struct kstatfs *); +extern int simple_link(struct dentry *, struct inode *, struct dentry *); +extern int simple_unlink(struct inode *, struct dentry *); +extern int simple_rmdir(struct inode *, struct dentry *); +extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); +extern int simple_sync_file(struct file *, struct dentry *, int); +extern int simple_empty(struct dentry *); +extern int simple_readpage(struct file *file, struct page *page); +extern int simple_prepare_write(struct file *file, struct page *page, + unsigned offset, unsigned to); +extern int simple_commit_write(struct file *file, struct page *page, + unsigned offset, unsigned to); + +extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); +extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); +extern struct file_operations simple_dir_operations; +extern struct inode_operations simple_dir_inode_operations; +struct 
tree_descr { char *name; struct file_operations *ops; int mode; }; +extern int simple_fill_super(struct super_block *, int, struct tree_descr *); +extern int simple_pin_fs(char *name, struct vfsmount **mount, int *count); +extern void simple_release_fs(struct vfsmount **mount, int *count); + +extern int inode_change_ok(struct inode *, struct iattr *); +extern int inode_setattr(struct inode *, struct iattr *); + +extern void inode_update_time(struct inode *inode, int ctime_too); + +static inline ino_t parent_ino(struct dentry *dentry) +{ + ino_t res; + + spin_lock(&dentry->d_lock); + res = dentry->d_parent->d_inode->i_ino; + spin_unlock(&dentry->d_lock); + return res; +} + +#endif /* __KERNEL__ */ +#endif /* _LINUX_FS_H */ diff -Nru a/include/linux/fs.h~sb_sync_inodes.diff b/include/linux/fs.h~sb_sync_inodes.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/linux/fs.h~sb_sync_inodes.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,1412 @@ +#ifndef _LINUX_FS_H +#define _LINUX_FS_H + +/* + * This file has definitions for some important file table + * structures etc. + */ + +#include <linux/config.h> +#include <linux/linkage.h> +#include <linux/limits.h> +#include <linux/wait.h> +#include <linux/types.h> +#include <linux/kdev_t.h> +#include <linux/ioctl.h> +#include <linux/list.h> +#include <linux/dcache.h> +#include <linux/stat.h> +#include <linux/cache.h> +#include <linux/radix-tree.h> +#include <linux/kobject.h> +#include <asm/atomic.h> + +struct iovec; +struct nameidata; +struct pipe_inode_info; +struct poll_table_struct; +struct kstatfs; +struct vm_area_struct; +struct vfsmount; + +/* + * It's silly to have NR_OPEN bigger than NR_FILE, but you can change + * the file limit at runtime and only root can increase the per-process + * nr_file rlimit, so it's safe to set up a ridiculously high absolute + * upper limit on files-per-process. + * + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. + */ + +/* Fixed constants first: */ +#undef NR_OPEN +#define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */ +#define INR_OPEN 1024 /* Initial setting for nfile rlimits */ + +#define BLOCK_SIZE_BITS 10 +#define BLOCK_SIZE (1<<BLOCK_SIZE_BITS) + +/* And dynamically-tunable limits and defaults: */ +struct files_stat_struct { + int nr_files; /* read only */ + int nr_free_files; /* read only */ + int max_files; /* tunable */ +}; +extern struct files_stat_struct files_stat; + +struct inodes_stat_t { + int nr_inodes; + int nr_unused; + int dummy[5]; +}; +extern struct inodes_stat_t inodes_stat; + +extern int leases_enable, dir_notify_enable, lease_break_time; + +#define NR_FILE 8192 /* this can well be larger on a larger system */ +#define NR_RESERVED_FILES 10 /* reserved for root */ +#define NR_SUPER 256 + +#define MAY_EXEC 1 +#define MAY_WRITE 2 +#define MAY_READ 4 +#define MAY_APPEND 8 + +#define FMODE_READ 1 +#define FMODE_WRITE 2 + +#define RW_MASK 1 +#define RWA_MASK 2 +#define READ 0 +#define WRITE 1 +#define READA 2 /* read-ahead - don't block if no resources */ +#define SPECIAL 4 /* For non-blockdevice requests in request queue */ + +#define SEL_IN 1 +#define SEL_OUT 2 +#define SEL_EX 4 + +/* public flags for file_system_type */ +#define FS_REQUIRES_DEV 1 +#define FS_REVAL_DOT 16384 /* Check the paths ".", ".." 
for staleness */ +#define FS_ODD_RENAME 32768 /* Temporary stuff; will go away as soon + * as nfs_rename() will be cleaned up + */ +/* + * These are the fs-independent mount-flags: up to 32 flags are supported + */ +#define MS_RDONLY 1 /* Mount read-only */ +#define MS_NOSUID 2 /* Ignore suid and sgid bits */ +#define MS_NODEV 4 /* Disallow access to device special files */ +#define MS_NOEXEC 8 /* Disallow program execution */ +#define MS_SYNCHRONOUS 16 /* Writes are synced at once */ +#define MS_REMOUNT 32 /* Alter flags of a mounted FS */ +#define MS_MANDLOCK 64 /* Allow mandatory locks on an FS */ +#define MS_DIRSYNC 128 /* Directory modifications are synchronous */ +#define MS_NOATIME 1024 /* Do not update access times. */ +#define MS_NODIRATIME 2048 /* Do not update directory access times */ +#define MS_BIND 4096 +#define MS_MOVE 8192 +#define MS_REC 16384 +#define MS_VERBOSE 32768 +#define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ +#define MS_ONE_SECOND (1<<17) /* fs has 1 sec a/m/ctime resolution */ +#define MS_ACTIVE (1<<30) +#define MS_NOUSER (1<<31) + +/* + * Superblock flags that can be altered by MS_REMOUNT + */ +#define MS_RMT_MASK (MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_NOATIME|\ + MS_NODIRATIME) + +/* + * Old magic mount flag and mask + */ +#define MS_MGC_VAL 0xC0ED0000 +#define MS_MGC_MSK 0xffff0000 + +/* Inode flags - they have nothing to superblock flags now */ + +#define S_SYNC 1 /* Writes are synced at once */ +#define S_NOATIME 2 /* Do not update access times */ +#define S_QUOTA 4 /* Quota initialized for file */ +#define S_APPEND 8 /* Append-only file */ +#define S_IMMUTABLE 16 /* Immutable file */ +#define S_DEAD 32 /* removed, but still open directory */ +#define S_NOQUOTA 64 /* Inode is not counted to quota */ +#define S_DIRSYNC 128 /* Directory modifications are synchronous */ + +/* + * Note that nosuid etc flags are inode-specific: setting some file-system + * flags just means all the inodes inherit those flags by default. It might be + * possible to override it selectively if you really wanted to with some + * ioctl() that is not currently implemented. + * + * Exception: MS_RDONLY is always applied to the entire file system. + * + * Unfortunately, it is possible to change a filesystems flags with it mounted + * with files in use. This means that all of the inodes will not have their + * i_flags updated. Hence, i_flags no longer inherit the superblock mount + * flags, so these have to be checked separately. 
-- rmk@arm.uk.linux.org + */ +#define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg)) + +#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) +#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \ + ((inode)->i_flags & S_SYNC)) +#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) || \ + ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) +#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) + +#define IS_QUOTAINIT(inode) ((inode)->i_flags & S_QUOTA) +#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) +#define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) +#define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) +#define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME)) +#define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) +#define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL) +#define IS_ONE_SECOND(inode) __IS_FLG(inode, MS_ONE_SECOND) + +#define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) + +/* the read-only stuff doesn't really belong here, but any other place is + probably as bad and I don't want to create yet another include file. */ + +#define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */ +#define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */ +#define BLKRRPART _IO(0x12,95) /* re-read partition table */ +#define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ +#define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ +#define BLKRASET _IO(0x12,98) /* set read ahead for block device */ +#define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ +#define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ +#define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ +#define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ +#define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ +#define BLKSSZGET _IO(0x12,104)/* get block device sector size */ +#if 0 +#define BLKPG _IO(0x12,105)/* See blkpg.h */ + +/* Some people are morons. Do not use sizeof! */ + +#define BLKELVGET _IOR(0x12,106,size_t)/* elevator get */ +#define BLKELVSET _IOW(0x12,107,size_t)/* elevator set */ +/* This was here just to show that the number is taken - + probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */ +#endif +/* A jump here: 108-111 have been used for various private purposes. */ +#define BLKBSZGET _IOR(0x12,112,size_t) +#define BLKBSZSET _IOW(0x12,113,size_t) +#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ + +#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ +#define FIBMAP _IO(0x00,1) /* bmap access */ +#define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ + +#ifdef __KERNEL__ + +#include <asm/semaphore.h> +#include <asm/byteorder.h> + +/* Used to be a macro which just called the function, now just a function */ +extern void update_atime (struct inode *); + +extern void inode_init(unsigned long); +extern void mnt_init(unsigned long); +extern void files_init(unsigned long); + +struct buffer_head; +typedef int (get_block_t)(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); +typedef int (get_blocks_t)(struct inode *inode, sector_t iblock, + unsigned long max_blocks, + struct buffer_head *bh_result, int create); +typedef void (dio_iodone_t)(struct inode *inode, loff_t offset, + ssize_t bytes, void *private); + +/* + * Attribute flags. 
These should be or-ed together to figure out what + * has been changed! + */ +#define ATTR_MODE 1 +#define ATTR_UID 2 +#define ATTR_GID 4 +#define ATTR_SIZE 8 +#define ATTR_ATIME 16 +#define ATTR_MTIME 32 +#define ATTR_CTIME 64 +#define ATTR_ATIME_SET 128 +#define ATTR_MTIME_SET 256 +#define ATTR_FORCE 512 /* Not a change, but a change it */ +#define ATTR_ATTR_FLAG 1024 +#define ATTR_KILL_SUID 2048 +#define ATTR_KILL_SGID 4096 + +/* + * This is the Inode Attributes structure, used for notify_change(). It + * uses the above definitions as flags, to know which values have changed. + * Also, in this manner, a Filesystem can look at only the values it cares + * about. Basically, these are the attributes that the VFS layer can + * request to change from the FS layer. + * + * Derek Atkins <warlord@MIT.EDU> 94-10-20 + */ +struct iattr { + unsigned int ia_valid; + umode_t ia_mode; + uid_t ia_uid; + gid_t ia_gid; + loff_t ia_size; + struct timespec ia_atime; + struct timespec ia_mtime; + struct timespec ia_ctime; + unsigned int ia_attr_flags; +}; + +/* + * This is the inode attributes flag definitions + */ +#define ATTR_FLAG_SYNCRONOUS 1 /* Syncronous write */ +#define ATTR_FLAG_NOATIME 2 /* Don't update atime */ +#define ATTR_FLAG_APPEND 4 /* Append-only file */ +#define ATTR_FLAG_IMMUTABLE 8 /* Immutable file */ +#define ATTR_FLAG_NODIRATIME 16 /* Don't update atime for directory */ + +/* + * Includes for diskquotas. + */ +#include <linux/quota.h> + +/* + * oh the beauties of C type declarations. + */ +struct page; +struct address_space; +struct writeback_control; +struct kiocb; + +struct address_space_operations { + int (*writepage)(struct page *page, struct writeback_control *wbc); + int (*readpage)(struct file *, struct page *); + int (*sync_page)(struct page *); + + /* Write back some dirty pages from this mapping. */ + int (*writepages)(struct address_space *, struct writeback_control *); + + /* Set a page dirty */ + int (*set_page_dirty)(struct page *page); + + int (*readpages)(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages); + + /* + * ext3 requires that a successful prepare_write() call be followed + * by a commit_write() call - they must be balanced + */ + int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); + int (*commit_write)(struct file *, struct page *, unsigned, unsigned); + /* Unfortunately this kludge is needed for FIBMAP. 
Don't use it */ + sector_t (*bmap)(struct address_space *, sector_t); + int (*invalidatepage) (struct page *, unsigned long); + int (*releasepage) (struct page *, int); + int (*direct_IO)(int, struct kiocb *, const struct iovec *iov, + loff_t offset, unsigned long nr_segs); +}; + +struct backing_dev_info; +struct address_space { + struct inode *host; /* owner: inode, block_device */ + struct radix_tree_root page_tree; /* radix tree of all pages */ + spinlock_t page_lock; /* and spinlock protecting it */ + struct list_head clean_pages; /* list of clean pages */ + struct list_head dirty_pages; /* list of dirty pages */ + struct list_head locked_pages; /* list of locked pages */ + struct list_head io_pages; /* being prepared for I/O */ + unsigned long nrpages; /* number of total pages */ + struct address_space_operations *a_ops; /* methods */ + struct list_head i_mmap; /* list of private mappings */ + struct list_head i_mmap_shared; /* list of shared mappings */ + struct semaphore i_shared_sem; /* protect both above lists */ + atomic_t truncate_count; /* Cover race condition with truncate */ + unsigned long dirtied_when; /* jiffies of first page dirtying */ + unsigned long flags; /* error bits/gfp mask */ + struct backing_dev_info *backing_dev_info; /* device readahead, etc */ + spinlock_t private_lock; /* for use by the address_space */ + struct list_head private_list; /* ditto */ + struct address_space *assoc_mapping; /* ditto */ +}; + +struct block_device { + dev_t bd_dev; /* not a kdev_t - it's a search key */ + struct inode * bd_inode; /* will die */ + int bd_openers; + struct semaphore bd_sem; /* open/close mutex */ + struct list_head bd_inodes; + void * bd_holder; + int bd_holders; + struct block_device * bd_contains; + unsigned bd_block_size; + struct hd_struct * bd_part; + unsigned bd_part_count; + int bd_invalidated; + struct gendisk * bd_disk; + struct list_head bd_list; +}; + +/* + * Use sequence counter to get consistent i_size on 32-bit processors. 
+ */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#include <linux/seqlock.h> +#define __NEED_I_SIZE_ORDERED +#define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) +#else +#define i_size_ordered_init(inode) do { } while (0) +#endif + +struct inode { + struct hlist_node i_hash; + struct list_head i_list; + struct list_head i_dentry; + unsigned long i_ino; + atomic_t i_count; + umode_t i_mode; + unsigned int i_nlink; + uid_t i_uid; + gid_t i_gid; + dev_t i_rdev; + loff_t i_size; + struct timespec i_atime; + struct timespec i_mtime; + struct timespec i_ctime; + unsigned int i_blkbits; + unsigned long i_blksize; + unsigned long i_version; + unsigned long i_blocks; + unsigned short i_bytes; + spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ + struct semaphore i_sem; + struct inode_operations *i_op; + struct file_operations *i_fop; /* former ->i_op->default_file_ops */ + struct super_block *i_sb; + struct file_lock *i_flock; + struct address_space *i_mapping; + struct address_space i_data; + struct dquot *i_dquot[MAXQUOTAS]; + /* These three should probably be a union */ + struct list_head i_devices; + struct pipe_inode_info *i_pipe; + struct block_device *i_bdev; + struct cdev *i_cdev; + int i_cindex; + + unsigned long i_dnotify_mask; /* Directory notify events */ + struct dnotify_struct *i_dnotify; /* for directory notifications */ + + unsigned long i_state; + + unsigned int i_flags; + unsigned char i_sock; + + atomic_t i_writecount; + void *i_security; + __u32 i_generation; + union { + void *generic_ip; + } u; +#ifdef __NEED_I_SIZE_ORDERED + seqcount_t i_size_seqcount; +#endif +}; + +/* + * NOTE: in a 32bit arch with a preemptable kernel and + * an UP compile the i_size_read/write must be atomic + * with respect to the local cpu (unlike with preempt disabled), + * but they don't need to be atomic with respect to other cpus like in + * true SMP (so they need either to either locally disable irq around + * the read or for example on x86 they can be still implemented as a + * cmpxchg8b without the need of the lock prefix). For SMP compiles + * and 64bit archs it makes no difference if preempt is enabled or not. 
+ */ +static inline loff_t i_size_read(struct inode *inode) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + loff_t i_size; + unsigned int seq; + + do { + seq = read_seqcount_begin(&inode->i_size_seqcount); + i_size = inode->i_size; + } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); + return i_size; +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) + loff_t i_size; + + preempt_disable(); + i_size = inode->i_size; + preempt_enable(); + return i_size; +#else + return inode->i_size; +#endif +} + + +static inline void i_size_write(struct inode *inode, loff_t i_size) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + write_seqcount_begin(&inode->i_size_seqcount); + inode->i_size = i_size; + write_seqcount_end(&inode->i_size_seqcount); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) + preempt_disable(); + inode->i_size = i_size; + preempt_enable(); +#else + inode->i_size = i_size; +#endif +} + +static inline unsigned iminor(struct inode *inode) +{ + return MINOR(inode->i_rdev); +} + +static inline unsigned imajor(struct inode *inode) +{ + return MAJOR(inode->i_rdev); +} + +struct fown_struct { + rwlock_t lock; /* protects pid, uid, euid fields */ + int pid; /* pid or -pgrp where SIGIO should be sent */ + uid_t uid, euid; /* uid/euid of process setting the owner */ + int signum; /* posix.1b rt signal to be delivered on IO */ + void *security; +}; + +/* + * Track a single file's readahead state + */ +struct file_ra_state { + unsigned long start; /* Current window */ + unsigned long size; + unsigned long next_size; /* Next window size */ + unsigned long prev_page; /* Cache last read() position */ + unsigned long ahead_start; /* Ahead window */ + unsigned long ahead_size; + unsigned long ra_pages; /* Maximum readahead window */ + unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ + unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ +}; + +struct file { + struct list_head f_list; + struct dentry *f_dentry; + struct vfsmount *f_vfsmnt; + struct file_operations *f_op; + atomic_t f_count; + unsigned int f_flags; + mode_t f_mode; + loff_t f_pos; + struct fown_struct f_owner; + unsigned int f_uid, f_gid; + int f_error; + struct file_ra_state f_ra; + + unsigned long f_version; + void *f_security; + + /* needed for tty driver, and maybe others */ + void *private_data; + + /* Used by fs/eventpoll.c to link all the hooks to this file */ + struct list_head f_ep_links; + spinlock_t f_ep_lock; +}; +extern spinlock_t files_lock; +#define file_list_lock() spin_lock(&files_lock); +#define file_list_unlock() spin_unlock(&files_lock); + +#define get_file(x) atomic_inc(&(x)->f_count) +#define file_count(x) atomic_read(&(x)->f_count) + +/* Initialize and open a private file and allocate its security structure. */ +extern int open_private_file(struct file *, struct dentry *, int); +/* Release a private file and free its security structure. */ +extern void close_private_file(struct file *file); + +#define MAX_NON_LFS ((1UL<<31) - 1) + +/* Page cache limit. The filesystems should put that into their s_maxbytes + limits, otherwise bad things can happen in VM. 
*/ +#if BITS_PER_LONG==32 +#define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) +#elif BITS_PER_LONG==64 +#define MAX_LFS_FILESIZE 0x7fffffffffffffff +#endif + +#define FL_POSIX 1 +#define FL_FLOCK 2 +#define FL_ACCESS 8 /* not trying to lock, just looking */ +#define FL_LOCKD 16 /* lock held by rpc.lockd */ +#define FL_LEASE 32 /* lease held on this file */ +#define FL_SLEEP 128 /* A blocking lock */ + +/* + * The POSIX file lock owner is determined by + * the "struct files_struct" in the thread group + * (or NULL for no owner - BSD locks). + * + * Lockd stuffs a "host" pointer into this. + */ +typedef struct files_struct *fl_owner_t; + +/* that will die - we need it for nfs_lock_info */ +#include <linux/nfs_fs_i.h> + +struct file_lock { + struct file_lock *fl_next; /* singly linked list for this inode */ + struct list_head fl_link; /* doubly linked list of all locks */ + struct list_head fl_block; /* circular list of blocked processes */ + fl_owner_t fl_owner; + unsigned int fl_pid; + wait_queue_head_t fl_wait; + struct file *fl_file; + unsigned char fl_flags; + unsigned char fl_type; + loff_t fl_start; + loff_t fl_end; + + void (*fl_notify)(struct file_lock *); /* unblock callback */ + void (*fl_insert)(struct file_lock *); /* lock insertion callback */ + void (*fl_remove)(struct file_lock *); /* lock removal callback */ + + struct fasync_struct * fl_fasync; /* for lease break notifications */ + unsigned long fl_break_time; /* for nonblocking lease breaks */ + + union { + struct nfs_lock_info nfs_fl; + } fl_u; +}; + +/* The following constant reflects the upper bound of the file/locking space */ +#ifndef OFFSET_MAX +#define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) +#define OFFSET_MAX INT_LIMIT(loff_t) +#define OFFT_OFFSET_MAX INT_LIMIT(off_t) +#endif + +extern struct list_head file_lock_list; + +#include <linux/fcntl.h> + +extern int fcntl_getlk(struct file *, struct flock __user *); +extern int fcntl_setlk(struct file *, unsigned int, struct flock __user *); + +#if BITS_PER_LONG == 32 +extern int fcntl_getlk64(struct file *, struct flock64 __user *); +extern int fcntl_setlk64(struct file *, unsigned int, struct flock64 __user *); +#endif + +extern void send_sigio(struct fown_struct *fown, int fd, int band); +extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); +extern int fcntl_getlease(struct file *filp); + +/* fs/locks.c */ +extern void locks_init_lock(struct file_lock *); +extern void locks_copy_lock(struct file_lock *, struct file_lock *); +extern void locks_remove_posix(struct file *, fl_owner_t); +extern void locks_remove_flock(struct file *); +extern struct file_lock *posix_test_lock(struct file *, struct file_lock *); +extern int posix_lock_file(struct file *, struct file_lock *); +extern void posix_block_lock(struct file_lock *, struct file_lock *); +extern void posix_unblock_lock(struct file *, struct file_lock *); +extern int posix_locks_deadlock(struct file_lock *, struct file_lock *); +extern int __break_lease(struct inode *inode, unsigned int flags); +extern void lease_get_mtime(struct inode *, struct timespec *time); +extern int lock_may_read(struct inode *, loff_t start, unsigned long count); +extern int lock_may_write(struct inode *, loff_t start, unsigned long count); + +struct fasync_struct { + int magic; + int fa_fd; + struct fasync_struct *fa_next; /* singly linked list */ + struct file *fa_file; +}; + +#define FASYNC_MAGIC 0x4601 + +/* SMP safe fasync helpers: */ +extern int fasync_helper(int, struct file *, int, struct 
fasync_struct **); +/* can be called from interrupts */ +extern void kill_fasync(struct fasync_struct **, int, int); +/* only for net: no internal synchronization */ +extern void __kill_fasync(struct fasync_struct *, int, int); + +extern int f_setown(struct file *filp, unsigned long arg, int force); +extern void f_delown(struct file *filp); +extern int send_sigurg(struct fown_struct *fown); + +/* + * Umount options + */ + +#define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ +#define MNT_DETACH 0x00000002 /* Just detach from the tree */ + +extern struct list_head super_blocks; +extern spinlock_t sb_lock; + +#define sb_entry(list) list_entry((list), struct super_block, s_list) +#define S_BIAS (1<<30) +struct super_block { + struct list_head s_list; /* Keep this first */ + dev_t s_dev; /* search index; _not_ kdev_t */ + unsigned long s_blocksize; + unsigned long s_old_blocksize; + unsigned char s_blocksize_bits; + unsigned char s_dirt; + unsigned long long s_maxbytes; /* Max file size */ + struct file_system_type *s_type; + struct super_operations *s_op; + struct dquot_operations *dq_op; + struct quotactl_ops *s_qcop; + struct export_operations *s_export_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; + struct rw_semaphore s_umount; + struct semaphore s_lock; + int s_count; + int s_syncing; + int s_need_sync_fs; + atomic_t s_active; + void *s_security; + + struct list_head s_dirty; /* dirty inodes */ + struct list_head s_io; /* parked for writeback */ + struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ + struct list_head s_files; + + struct block_device *s_bdev; + struct list_head s_instances; + struct quota_info s_dquot; /* Diskquota specific options */ + + char s_id[32]; /* Informational name */ + + struct kobject kobj; /* anchor for sysfs */ + void *s_fs_info; /* Filesystem private info */ + + /* + * The next field is for VFS *only*. No filesystems have any business + * even looking at it. You had been warned. + */ + struct semaphore s_vfs_rename_sem; /* Kludge */ +}; + +/* + * Superblock locking. + */ +static inline void lock_super(struct super_block * sb) +{ + down(&sb->s_lock); +} + +static inline void unlock_super(struct super_block * sb) +{ + up(&sb->s_lock); +} + +/* + * VFS helper functions.. + */ +extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *); +extern int vfs_mkdir(struct inode *, struct dentry *, int); +extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); +extern int vfs_symlink(struct inode *, struct dentry *, const char *); +extern int vfs_link(struct dentry *, struct inode *, struct dentry *); +extern int vfs_rmdir(struct inode *, struct dentry *); +extern int vfs_unlink(struct inode *, struct dentry *); +extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); + +/* + * File types + * + * NOTE! These match bits 12..15 of stat.st_mode + * (ie "(i_mode >> 12) & 15"). + */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 + +/* + * This is the "filldir" function type, used by readdir() to let + * the kernel specify what kind of dirent layout it wants to have. + * This allows the kernel to read directories into kernel space or + * to have different dirent layouts depending on the binary type. 
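The filldir comment above is best read next to a concrete readdir method. Here is a sketch against the filldir_t signature declared just below (buffer, name, name length, current offset, inode number, DT_* type); the filesystem is hypothetical and only emits the two standard entries:

#include <linux/fs.h>

/* A negative return from filldir() means the user buffer is full; the
 * method returns and is re-entered later at the saved f_pos. */
static int examplefs_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	struct dentry *dentry = filp->f_dentry;

	if (filp->f_pos == 0) {
		if (filldir(dirent, ".", 1, filp->f_pos,
			    dentry->d_inode->i_ino, DT_DIR) < 0)
			return 0;
		filp->f_pos = 1;
	}
	if (filp->f_pos == 1) {
		if (filldir(dirent, "..", 2, filp->f_pos,
			    dentry->d_parent->d_inode->i_ino, DT_DIR) < 0)
			return 0;
		filp->f_pos = 2;
	}
	return 0;
}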
+ */ +typedef int (*filldir_t)(void *, const char *, int, loff_t, ino_t, unsigned); + +struct block_device_operations { + int (*open) (struct inode *, struct file *); + int (*release) (struct inode *, struct file *); + int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); + int (*media_changed) (struct gendisk *); + int (*revalidate_disk) (struct gendisk *); + struct module *owner; +}; + +/* + * "descriptor" for what we're up to with a read for sendfile(). + * This allows us to use the same read code yet + * have multiple different users of the data that + * we read from a file. + * + * The simplest case just copies the data to user + * mode. + */ +typedef struct { + size_t written; + size_t count; + char __user * buf; + int error; +} read_descriptor_t; + +typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); + +/* + * NOTE: + * read, write, poll, fsync, readv, writev can be called + * without the big kernel lock held in all filesystems. + */ +struct file_operations { + struct module *owner; + loff_t (*llseek) (struct file *, loff_t, int); + ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); + ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t); + ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); + ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t, loff_t); + int (*readdir) (struct file *, void *, filldir_t); + unsigned int (*poll) (struct file *, struct poll_table_struct *); + int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); + int (*mmap) (struct file *, struct vm_area_struct *); + int (*open) (struct inode *, struct file *); + int (*flush) (struct file *); + int (*release) (struct inode *, struct file *); + int (*fsync) (struct file *, struct dentry *, int datasync); + int (*aio_fsync) (struct kiocb *, int datasync); + int (*fasync) (int, struct file *, int); + int (*lock) (struct file *, int, struct file_lock *); + ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); + ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); + ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void __user *); + ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +}; + +struct inode_operations { + int (*create) (struct inode *,struct dentry *,int, struct nameidata *); + struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); + int (*link) (struct dentry *,struct inode *,struct dentry *); + int (*unlink) (struct inode *,struct dentry *); + int (*symlink) (struct inode *,struct dentry *,const char *); + int (*mkdir) (struct inode *,struct dentry *,int); + int (*rmdir) (struct inode *,struct dentry *); + int (*mknod) (struct inode *,struct dentry *,int,dev_t); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); + int (*readlink) (struct dentry *, char __user *,int); + int (*follow_link) (struct dentry *, struct nameidata *); + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int, struct nameidata *); + int (*setattr) (struct dentry *, struct iattr *); + int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); + int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); + ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); + 
ssize_t (*listxattr) (struct dentry *, char *, size_t); + int (*removexattr) (struct dentry *, const char *); +}; + +struct seq_file; + +extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); +extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); +extern ssize_t vfs_readv(struct file *, const struct iovec __user *, + unsigned long, loff_t *); +extern ssize_t vfs_writev(struct file *, const struct iovec __user *, + unsigned long, loff_t *); + +/* + * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called + * without the big kernel lock held in all filesystems. + */ +struct super_operations { + struct inode *(*alloc_inode)(struct super_block *sb); + void (*destroy_inode)(struct inode *); + + void (*read_inode) (struct inode *); + + void (*dirty_inode) (struct inode *); + void (*write_inode) (struct inode *, int); + void (*put_inode) (struct inode *); + void (*drop_inode) (struct inode *); + void (*delete_inode) (struct inode *); + void (*put_super) (struct super_block *); + void (*write_super) (struct super_block *); + int (*sync_fs)(struct super_block *sb, int wait); + void (*write_super_lockfs) (struct super_block *); + void (*unlockfs) (struct super_block *); + int (*statfs) (struct super_block *, struct kstatfs *); + int (*remount_fs) (struct super_block *, int *, char *); + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); + + int (*show_options)(struct seq_file *, struct vfsmount *); +}; + +/* Inode state bits. Protected by inode_lock. */ +#define I_DIRTY_SYNC 1 /* Not dirty enough for O_DATASYNC */ +#define I_DIRTY_DATASYNC 2 /* Data-related inode changes pending */ +#define I_DIRTY_PAGES 4 /* Data-related inode changes pending */ +#define I_LOCK 8 +#define I_FREEING 16 +#define I_CLEAR 32 +#define I_NEW 64 + +#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) + +extern void __mark_inode_dirty(struct inode *, int); +static inline void mark_inode_dirty(struct inode *inode) +{ + __mark_inode_dirty(inode, I_DIRTY); +} + +static inline void mark_inode_dirty_sync(struct inode *inode) +{ + __mark_inode_dirty(inode, I_DIRTY_SYNC); +} + + +/** + * &export_operations - for nfsd to communicate with file systems + * decode_fh: decode a file handle fragment and return a &struct dentry + * encode_fh: encode a file handle fragment from a dentry + * get_name: find the name for a given inode in a given directory + * get_parent: find the parent of a given directory + * get_dentry: find a dentry for the inode given a file handle sub-fragment + * + * Description: + * The export_operations structure provides a means for nfsd to communicate + * with a particular exported file system - particularly enabling nfsd and + * the filesystem to co-operate when dealing with file handles. + * + * export_operations contains two basic operations for dealing with file handles, + * decode_fh() and encode_fh(), and allows for some other operations to be defined + * which standard helper routines use to get specific information from the + * filesystem. + * + * nfsd encodes information used to determine which filesystem a filehandle + * applies to in the initial part of the file handle. The remainder, termed a + * file handle fragment, is controlled completely by the filesystem. + * The standard helper routines assume that this fragment will contain one or two + * sub-fragments, one which identifies the file, and one which may be used to + * identify the (or a) directory containing the file.
+ * + * In some situations, nfsd needs to get a dentry which is connected into a + * specific part of the file tree. To allow for this, it passes the function + * acceptable() together with a @context which can be used to see if the dentry + * is acceptable. As there can be multiple dentries for a given file, the filesystem + * should check each one for acceptability before looking for the next. As soon + * as an acceptable one is found, it should be returned. + * + * decode_fh: + * @decode_fh is given a &struct super_block (@sb), a file handle fragment (@fh, @fh_len) + * and an acceptability testing function (@acceptable, @context). It should return + * a &struct dentry which refers to the same file that the file handle fragment refers + * to, and which passes the acceptability test. If it cannot, it should return + * a %NULL pointer if the file was found but no acceptable &dentries were available, or + * an %ERR_PTR error code indicating why it couldn't be found (e.g. %ENOENT or %ENOMEM). + * + * encode_fh: + * @encode_fh should store in the file handle fragment @fh (using at most @max_len bytes) + * information that can be used by @decode_fh to recover the file referred to by the + * &struct dentry @de. If the @connectable flag is set, encode_fh() should store + * sufficient information so that a good attempt can be made to find not only + * the file but also its place in the filesystem. This typically means storing + * a reference to de->d_parent in the filehandle fragment. + * encode_fh() should return the number of bytes stored or a negative error code + * such as %-ENOSPC. + * + * get_name: + * @get_name should find a name for the given @child in the given @parent directory. + * The name should be stored in @name (with the understanding that it already + * points to a %NAME_MAX+1 sized buffer). get_name() should return %0 on success, + * or a negative error code on failure. + * @get_name will be called without @parent->i_sem held. + * + * get_parent: + * @get_parent should find the parent directory for the given @child which is also + * a directory. In the event that it cannot be found, or storage space cannot be + * allocated, an %ERR_PTR should be returned. + * + * get_dentry: + * Given a &super_block (@sb) and a pointer to a file-system specific inode identifier, + * possibly an inode number, (@inump) get_dentry() should find the identified inode and + * return a dentry for that inode. + * Any suitable dentry can be returned including, if necessary, a new dentry created + * with d_alloc_root. The caller can then find any other extant dentries by following the + * d_alias links. If a new dentry was created using d_alloc_root, DCACHE_NFSD_DISCONNECTED + * should be set, and the dentry should be d_rehash()ed. + * + * If the inode cannot be found, either a %NULL pointer or an %ERR_PTR code can be returned. + * The @inump will be whatever was passed to nfsd_find_fh_dentry() in either the + * @obj or @parent parameters.
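Pulling the documentation above together, here is a sketch of how a filesystem might wire up its export_operations (the struct itself follows the locking rules below). The function bodies are placeholders, and the fallback behaviour noted in the comment is an assumption for illustration; in this tree the exporting module also fills in find_exported_dentry with the standard helper:

#include <linux/fs.h>
#include <linux/err.h>

static struct dentry *examplefs_get_parent(struct dentry *child)
{
	/* Hypothetical: resolve ".." in the on-disk directory and return
	 * a dentry for it; -ESTALE here just keeps the sketch compiling. */
	return ERR_PTR(-ESTALE);
}

static struct dentry *examplefs_get_dentry(struct super_block *sb, void *inump)
{
	/* Hypothetical: *(__u32 *)inump would be an inode number that
	 * encode_fh() stored in the file handle fragment. */
	return ERR_PTR(-ESTALE);
}

static struct export_operations examplefs_export_ops = {
	/* decode_fh/encode_fh left NULL - assumption: the generic
	 * helpers then treat the fragment as inode numbers. */
	.get_parent = examplefs_get_parent,
	.get_dentry = examplefs_get_dentry,
};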
+ * + * Locking rules: + * get_parent is called with child->d_inode->i_sem down + * get_name is not (which is possibly inconsistent) + */ + +struct export_operations { + struct dentry *(*decode_fh)(struct super_block *sb, __u32 *fh, int fh_len, int fh_type, + int (*acceptable)(void *context, struct dentry *de), + void *context); + int (*encode_fh)(struct dentry *de, __u32 *fh, int *max_len, + int connectable); + + /* the following are only called from the filesystem itself */ + int (*get_name)(struct dentry *parent, char *name, + struct dentry *child); + struct dentry * (*get_parent)(struct dentry *child); + struct dentry * (*get_dentry)(struct super_block *sb, void *inump); + + /* This is set by the exporting module to a standard helper */ + struct dentry * (*find_exported_dentry)( + struct super_block *sb, void *obj, void *parent, + int (*acceptable)(void *context, struct dentry *de), + void *context); + + +}; + + +struct file_system_type { + const char *name; + int fs_flags; + struct super_block *(*get_sb) (struct file_system_type *, int, + const char *, void *); + void (*kill_sb) (struct super_block *); + struct module *owner; + struct file_system_type * next; + struct list_head fs_supers; +}; + +struct super_block *get_sb_bdev(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + int (*fill_super)(struct super_block *, void *, int)); +struct super_block *get_sb_single(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)); +struct super_block *get_sb_nodev(struct file_system_type *fs_type, + int flags, void *data, + int (*fill_super)(struct super_block *, void *, int)); +void generic_shutdown_super(struct super_block *sb); +void kill_block_super(struct super_block *sb); +void kill_anon_super(struct super_block *sb); +void kill_litter_super(struct super_block *sb); +void deactivate_super(struct super_block *sb); +int set_anon_super(struct super_block *s, void *data); +struct super_block *sget(struct file_system_type *type, + int (*test)(struct super_block *,void *), + int (*set)(struct super_block *,void *), + void *data); +struct super_block *get_sb_pseudo(struct file_system_type *, char *, + struct super_operations *ops, unsigned long); + +/* Alas, no aliases. Too much hassle with bringing module.h everywhere */ +#define fops_get(fops) \ + (((fops) && try_module_get((fops)->owner) ? (fops) : NULL)) +#define fops_put(fops) \ + do { if (fops) module_put((fops)->owner); } while(0) + +extern int register_filesystem(struct file_system_type *); +extern int unregister_filesystem(struct file_system_type *); +extern struct vfsmount *kern_mount(struct file_system_type *); +extern int may_umount(struct vfsmount *); +extern long do_mount(char *, char *, char *, unsigned long, void *); + +extern int vfs_statfs(struct super_block *, struct kstatfs *); + +/* Return value for VFS lock functions - tells locks.c to lock conventionally + * REALLY kosha for root NFS and nfs_lock + */ +#define LOCK_USE_CLNT 1 + +#define FLOCK_VERIFY_READ 1 +#define FLOCK_VERIFY_WRITE 2 + +extern int locks_mandatory_locked(struct inode *); +extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); + +/* + * Candidates for mandatory locking have the setgid bit set + * but no group execute bit - an otherwise meaningless combination. 
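That setgid-without-group-execute rule is also how userspace opts a file into mandatory locking, provided the filesystem is mounted with the mandatory-lock option so IS_MANDLOCK() is true. A small, runnable userspace sketch:

#include <stdio.h>
#include <sys/stat.h>

/* Set S_ISGID and clear S_IXGRP: the "otherwise meaningless
 * combination" that MANDATORY_LOCK() tests for. */
static int make_mandatory(const char *path)
{
	struct stat st;

	if (stat(path, &st) < 0)
		return -1;
	return chmod(path, (st.st_mode & 07777 & ~S_IXGRP) | S_ISGID);
}

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	return make_mandatory(argv[1]) ? 1 : 0;
}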
+ */ +#define MANDATORY_LOCK(inode) \ + (IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + +static inline int locks_verify_locked(struct inode *inode) +{ + if (MANDATORY_LOCK(inode)) + return locks_mandatory_locked(inode); + return 0; +} + +static inline int locks_verify_area(int read_write, struct inode *inode, + struct file *filp, loff_t offset, + size_t count) +{ + if (inode->i_flock && MANDATORY_LOCK(inode)) + return locks_mandatory_area(read_write, inode, filp, offset, count); + return 0; +} + +static inline int locks_verify_truncate(struct inode *inode, + struct file *filp, + loff_t size) +{ + if (inode->i_flock && MANDATORY_LOCK(inode)) + return locks_mandatory_area( + FLOCK_VERIFY_WRITE, inode, filp, + size < inode->i_size ? size : inode->i_size, + (size < inode->i_size ? inode->i_size - size + : size - inode->i_size) + ); + return 0; +} + +static inline int break_lease(struct inode *inode, unsigned int mode) +{ + if (inode->i_flock) + return __break_lease(inode, mode); + return 0; +} + +/* fs/open.c */ + +asmlinkage long sys_open(const char __user *, int, int); +asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ +extern int do_truncate(struct dentry *, loff_t start); + +extern struct file *filp_open(const char *, int, int); +extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); +extern int filp_close(struct file *, fl_owner_t id); +extern char * getname(const char __user *); + +/* fs/dcache.c */ +extern void vfs_caches_init(unsigned long); + +#define __getname() kmem_cache_alloc(names_cachep, SLAB_KERNEL) +#define putname(name) kmem_cache_free(names_cachep, (void *)(name)) + +enum {BDEV_FILE, BDEV_SWAP, BDEV_FS, BDEV_RAW}; +extern int register_blkdev(unsigned int, const char *); +extern int unregister_blkdev(unsigned int, const char *); +extern struct block_device *bdget(dev_t); +extern int bd_acquire(struct inode *inode); +extern void bd_forget(struct inode *inode); +extern void bdput(struct block_device *); +extern int blkdev_open(struct inode *, struct file *); +extern int blkdev_close(struct inode *, struct file *); +extern struct block_device *open_by_devnum(dev_t, unsigned, int); +extern struct file_operations def_blk_fops; +extern struct address_space_operations def_blk_aops; +extern struct file_operations def_chr_fops; +extern struct file_operations bad_sock_fops; +extern struct file_operations def_fifo_fops; +extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); +extern int blkdev_ioctl(struct inode *, struct file *, unsigned, unsigned long); +extern int blkdev_get(struct block_device *, mode_t, unsigned, int); +extern int blkdev_put(struct block_device *, int); +extern int bd_claim(struct block_device *, void *); +extern void bd_release(struct block_device *); +extern void blk_run_queues(void); + +/* fs/char_dev.c */ +extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, char *); +extern int register_chrdev_region(dev_t, unsigned, char *); +extern int register_chrdev(unsigned int, const char *, + struct file_operations *); +extern int unregister_chrdev(unsigned int, const char *); +extern void unregister_chrdev_region(dev_t, unsigned); +extern int chrdev_open(struct inode *, struct file *); + +/* fs/block_dev.c */ +#define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ +extern const char *__bdevname(dev_t, char *buffer); +extern const char *bdevname(struct block_device *bdev, char *buffer); +extern struct block_device *lookup_bdev(const char *); +extern struct 
block_device *open_bdev_excl(const char *, int, int, void *); +extern void close_bdev_excl(struct block_device *, int); + +extern void init_special_inode(struct inode *, umode_t, dev_t); + +/* Invalid inode operations -- fs/bad_inode.c */ +extern void make_bad_inode(struct inode *); +extern int is_bad_inode(struct inode *); + +extern struct file_operations read_fifo_fops; +extern struct file_operations write_fifo_fops; +extern struct file_operations rdwr_fifo_fops; +extern struct file_operations read_pipe_fops; +extern struct file_operations write_pipe_fops; +extern struct file_operations rdwr_pipe_fops; + +extern int fs_may_remount_ro(struct super_block *); + +/* + * return READ, READA, or WRITE + */ +#define bio_rw(bio) ((bio)->bi_rw & (RW_MASK | RWA_MASK)) + +/* + * return data direction, READ or WRITE + */ +#define bio_data_dir(bio) ((bio)->bi_rw & 1) + +extern int check_disk_change(struct block_device *); +extern int invalidate_inodes(struct super_block *); +extern int __invalidate_device(struct block_device *, int); +extern int invalidate_partition(struct gendisk *, int); +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end); +unsigned long invalidate_inode_pages(struct address_space *mapping); +static inline void invalidate_remote_inode(struct inode *inode) +{ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) + invalidate_inode_pages(inode->i_mapping); +} +extern void invalidate_inode_pages2(struct address_space *mapping); +extern void write_inode_now(struct inode *, int); +extern int filemap_fdatawrite(struct address_space *); +extern int filemap_flush(struct address_space *); +extern int filemap_fdatawait(struct address_space *); +extern void sync_supers(void); +extern void sync_filesystems(int wait); +extern void emergency_sync(void); +extern void emergency_remount(void); +extern int do_remount_sb(struct super_block *sb, int flags, + void *data, int force); +extern sector_t bmap(struct inode *, sector_t); +extern int setattr_mask(unsigned int); +extern int notify_change(struct dentry *, struct iattr *); +extern int permission(struct inode *, int, struct nameidata *); +extern int vfs_permission(struct inode *, int); +extern int get_write_access(struct inode *); +extern int deny_write_access(struct file *); +static inline void put_write_access(struct inode * inode) +{ + atomic_dec(&inode->i_writecount); +} +static inline void allow_write_access(struct file *file) +{ + if (file) + atomic_inc(&file->f_dentry->d_inode->i_writecount); +} +extern int do_pipe(int *); + +extern int open_namei(const char *, int, int, struct nameidata *); +extern int may_open(struct nameidata *, int, int); + +extern int kernel_read(struct file *, unsigned long, char *, unsigned long); +extern struct file * open_exec(const char *); + +/* fs/dcache.c -- generic fs support functions */ +extern int is_subdir(struct dentry *, struct dentry *); +extern ino_t find_inode_number(struct dentry *, struct qstr *); + +#include <linux/err.h> + +/* needed for stackable file system support */ +extern loff_t default_llseek(struct file *file, loff_t offset, int origin); + +extern void inode_init_once(struct inode *); +extern void iput(struct inode *); +extern struct inode * igrab(struct inode *); +extern ino_t iunique(struct super_block *, ino_t); +extern int inode_needs_sync(struct inode *inode); +extern void generic_delete_inode(struct inode *inode); + +extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int 
(*test)(struct inode *, void *), void *data); +extern struct inode *ilookup(struct super_block *sb, unsigned long ino); + +extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); +extern struct inode * iget_locked(struct super_block *, unsigned long); +extern void unlock_new_inode(struct inode *); + +static inline struct inode *iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode = iget_locked(sb, ino); + + if (inode && (inode->i_state & I_NEW)) { + sb->s_op->read_inode(inode); + unlock_new_inode(inode); + } + + return inode; +} + +extern void __iget(struct inode * inode); +extern void clear_inode(struct inode *); +extern void destroy_inode(struct inode *); +extern struct inode *new_inode(struct super_block *); +extern void remove_suid(struct dentry *); + +extern void __insert_inode_hash(struct inode *, unsigned long hashval); +extern void remove_inode_hash(struct inode *); +static inline void insert_inode_hash(struct inode *inode) { + __insert_inode_hash(inode, inode->i_ino); +} + +extern struct file * get_empty_filp(void); +extern void file_move(struct file *f, struct list_head *list); +extern void file_kill(struct file *f); +struct bio; +extern int submit_bio(int, struct bio *); +extern int bdev_read_only(struct block_device *); +extern int set_blocksize(struct block_device *, int); +extern int sb_set_blocksize(struct super_block *, int); +extern int sb_min_blocksize(struct super_block *, int); + +extern int generic_file_mmap(struct file *, struct vm_area_struct *); +extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); +extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); +extern int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); +extern ssize_t generic_file_read(struct file *, char __user *, size_t, loff_t *); +int generic_write_checks(struct inode *inode, struct file *file, + loff_t *pos, size_t *count, int isblk); +extern ssize_t generic_file_write(struct file *, const char __user *, size_t, loff_t *); +extern ssize_t generic_file_aio_read(struct kiocb *, char __user *, size_t, loff_t); +extern ssize_t __generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t *); +extern ssize_t generic_file_aio_write(struct kiocb *, const char __user *, size_t, loff_t); +extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, + unsigned long, loff_t *); +extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); +extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); +ssize_t generic_file_write_nolock(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos); +extern ssize_t generic_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void __user *); +extern void do_generic_mapping_read(struct address_space *, struct file_ra_state *, struct file *, + loff_t *, read_descriptor_t *, read_actor_t); +extern void +file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); +extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, unsigned long nr_segs); +extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long 
nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io); +extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos); +ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos); +extern loff_t no_llseek(struct file *file, loff_t offset, int origin); +extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); +extern loff_t remote_llseek(struct file *file, loff_t offset, int origin); +extern int generic_file_open(struct inode * inode, struct file * filp); + +static inline void do_generic_file_read(struct file * filp, loff_t *ppos, + read_descriptor_t * desc, + read_actor_t actor) +{ + do_generic_mapping_read(filp->f_dentry->d_inode->i_mapping, + &filp->f_ra, + filp, + ppos, + desc, + actor); +} + +extern struct file_operations generic_ro_fops; + +#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) + +extern int vfs_readlink(struct dentry *, char __user *, int, const char *); +extern int vfs_follow_link(struct nameidata *, const char *); +extern int page_readlink(struct dentry *, char __user *, int); +extern int page_follow_link(struct dentry *, struct nameidata *); +extern int page_symlink(struct inode *inode, const char *symname, int len); +extern struct inode_operations page_symlink_inode_operations; +extern void generic_fillattr(struct inode *, struct kstat *); +extern int vfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); +void inode_add_bytes(struct inode *inode, loff_t bytes); +void inode_sub_bytes(struct inode *inode, loff_t bytes); +loff_t inode_get_bytes(struct inode *inode); +void inode_set_bytes(struct inode *inode, loff_t bytes); + +extern int vfs_readdir(struct file *, filldir_t, void *); + +extern int vfs_stat(char __user *, struct kstat *); +extern int vfs_lstat(char __user *, struct kstat *); +extern int vfs_fstat(unsigned int, struct kstat *); + +extern struct file_system_type *get_fs_type(const char *name); +extern struct super_block *get_super(struct block_device *); +extern struct super_block *user_get_super(dev_t); +extern void drop_super(struct super_block *sb); + +extern int dcache_dir_open(struct inode *, struct file *); +extern int dcache_dir_close(struct inode *, struct file *); +extern loff_t dcache_dir_lseek(struct file *, loff_t, int); +extern int dcache_readdir(struct file *, void *, filldir_t); +extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); +extern int simple_statfs(struct super_block *, struct kstatfs *); +extern int simple_link(struct dentry *, struct inode *, struct dentry *); +extern int simple_unlink(struct inode *, struct dentry *); +extern int simple_rmdir(struct inode *, struct dentry *); +extern int simple_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); +extern int simple_sync_file(struct file *, struct dentry *, int); +extern int simple_empty(struct dentry *); +extern int simple_readpage(struct file *file, struct page *page); +extern int simple_prepare_write(struct file *file, struct page *page, + unsigned offset, unsigned to); +extern int simple_commit_write(struct file *file, struct page *page, + unsigned offset, unsigned to); + +extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); +extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); +extern struct file_operations simple_dir_operations; +extern struct inode_operations simple_dir_inode_operations; +struct 
tree_descr { char *name; struct file_operations *ops; int mode; }; +extern int simple_fill_super(struct super_block *, int, struct tree_descr *); +extern int simple_pin_fs(char *name, struct vfsmount **mount, int *count); +extern void simple_release_fs(struct vfsmount **mount, int *count); + +extern int inode_change_ok(struct inode *, struct iattr *); +extern int inode_setattr(struct inode *, struct iattr *); + +extern void inode_update_time(struct inode *inode, int ctime_too); + +static inline ino_t parent_ino(struct dentry *dentry) +{ + ino_t res; + + spin_lock(&dentry->d_lock); + res = dentry->d_parent->d_inode->i_ino; + spin_unlock(&dentry->d_lock); + return res; +} + +#endif /* __KERNEL__ */ +#endif /* _LINUX_FS_H */ diff -Nru a/include/linux/in.h b/include/linux/in.h --- a/include/linux/in.h Fri Oct 31 14:10:54 2003 +++ b/include/linux/in.h Fri Oct 31 14:10:54 2003 @@ -140,29 +140,29 @@ struct group_req { - __u32 gr_interface; /* interface index */ - struct sockaddr_storage gr_group; /* group address */ + __u32 gr_interface; /* interface index */ + struct __kernel_sockaddr_storage gr_group; /* group address */ }; struct group_source_req { - __u32 gsr_interface; /* interface index */ - struct sockaddr_storage gsr_group; /* group address */ - struct sockaddr_storage gsr_source; /* source address */ + __u32 gsr_interface; /* interface index */ + struct __kernel_sockaddr_storage gsr_group; /* group address */ + struct __kernel_sockaddr_storage gsr_source; /* source address */ }; struct group_filter { - __u32 gf_interface; /* interface index */ - struct sockaddr_storage gf_group; /* multicast address */ - __u32 gf_fmode; /* filter mode */ - __u32 gf_numsrc; /* number of sources */ - struct sockaddr_storage gf_slist[1]; /* interface index */ + __u32 gf_interface; /* interface index */ + struct __kernel_sockaddr_storage gf_group; /* multicast address */ + __u32 gf_fmode; /* filter mode */ + __u32 gf_numsrc; /* number of sources */ + struct __kernel_sockaddr_storage gf_slist[1]; /* interface index */ }; #define GROUP_FILTER_SIZE(numsrc) \ - (sizeof(struct group_filter) - sizeof(struct sockaddr_storage) \ - + (numsrc) * sizeof(struct sockaddr_storage)) + (sizeof(struct group_filter) - sizeof(struct __kernel_sockaddr_storage) \ + + (numsrc) * sizeof(struct __kernel_sockaddr_storage)) struct in_pktinfo { diff -Nru a/include/linux/init_task.h b/include/linux/init_task.h --- a/include/linux/init_task.h Fri Oct 31 14:10:53 2003 +++ b/include/linux/init_task.h Fri Oct 31 14:10:53 2003 @@ -107,7 +107,7 @@ .alloc_lock = SPIN_LOCK_UNLOCKED, \ .proc_lock = SPIN_LOCK_UNLOCKED, \ .switch_lock = SPIN_LOCK_UNLOCKED, \ - .journal_info = NULL, \ + .fs_context = NULL, \ } diff -Nru a/include/linux/init_task.h~fs_activation.diff b/include/linux/init_task.h~fs_activation.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/linux/init_task.h~fs_activation.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,115 @@ +#ifndef _LINUX__INIT_TASK_H +#define _LINUX__INIT_TASK_H + +#include <linux/file.h> + +#define INIT_FILES \ +{ \ + .count = ATOMIC_INIT(1), \ + .file_lock = SPIN_LOCK_UNLOCKED, \ + .max_fds = NR_OPEN_DEFAULT, \ + .max_fdset = __FD_SETSIZE, \ + .next_fd = 0, \ + .fd = &init_files.fd_array[0], \ + .close_on_exec = &init_files.close_on_exec_init, \ + .open_fds = &init_files.open_fds_init, \ + .close_on_exec_init = { { 0, } }, \ + .open_fds_init = { { 0, } }, \ + .fd_array = { NULL, } \ +} + +#define INIT_KIOCTX(name, which_mm) \ +{ \ + .users = ATOMIC_INIT(1), \ + .dead = 0, \ + .mm = &which_mm, \ + .user_id = 
0, \ + .next = NULL, \ + .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait), \ + .ctx_lock = SPIN_LOCK_UNLOCKED, \ + .reqs_active = 0U, \ + .max_reqs = ~0U, \ +} + +#define INIT_MM(name) \ +{ \ + .mm_rb = RB_ROOT, \ + .pgd = swapper_pg_dir, \ + .mm_users = ATOMIC_INIT(2), \ + .mm_count = ATOMIC_INIT(1), \ + .mmap_sem = __RWSEM_INITIALIZER(name.mmap_sem), \ + .page_table_lock = SPIN_LOCK_UNLOCKED, \ + .mmlist = LIST_HEAD_INIT(name.mmlist), \ + .default_kioctx = INIT_KIOCTX(name.default_kioctx, name), \ +} + +#define INIT_SIGNALS(sig) { \ + .count = ATOMIC_INIT(1), \ + .shared_pending = { \ + .list = LIST_HEAD_INIT(sig.shared_pending.list), \ + .signal = {{0}}}, \ +} + +#define INIT_SIGHAND(sighand) { \ + .count = ATOMIC_INIT(1), \ + .action = { {{0,}}, }, \ + .siglock = SPIN_LOCK_UNLOCKED, \ +} + +/* + * INIT_TASK is used to set up the first task table, touch at + * your own risk!. Base=0, limit=0x1fffff (=2MB) + */ +#define INIT_TASK(tsk) \ +{ \ + .state = 0, \ + .thread_info = &init_thread_info, \ + .usage = ATOMIC_INIT(2), \ + .flags = 0, \ + .lock_depth = -1, \ + .prio = MAX_PRIO-20, \ + .static_prio = MAX_PRIO-20, \ + .policy = SCHED_NORMAL, \ + .cpus_allowed = CPU_MASK_ALL, \ + .mm = NULL, \ + .active_mm = &init_mm, \ + .run_list = LIST_HEAD_INIT(tsk.run_list), \ + .time_slice = HZ, \ + .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ + .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ + .real_parent = &tsk, \ + .parent = &tsk, \ + .children = LIST_HEAD_INIT(tsk.children), \ + .sibling = LIST_HEAD_INIT(tsk.sibling), \ + .group_leader = &tsk, \ + .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\ + .real_timer = { \ + .function = it_real_fn \ + }, \ + .cap_effective = CAP_INIT_EFF_SET, \ + .cap_inheritable = CAP_INIT_INH_SET, \ + .cap_permitted = CAP_FULL_SET, \ + .keep_capabilities = 0, \ + .rlim = INIT_RLIMITS, \ + .user = INIT_USER, \ + .comm = "swapper", \ + .thread = INIT_THREAD, \ + .fs = &init_fs, \ + .files = &init_files, \ + .signal = &init_signals, \ + .sighand = &init_sighand, \ + .pending = { \ + .list = LIST_HEAD_INIT(tsk.pending.list), \ + .signal = {{0}}}, \ + .blocked = {{0}}, \ + .posix_timers = LIST_HEAD_INIT(tsk.posix_timers), \ + .alloc_lock = SPIN_LOCK_UNLOCKED, \ + .proc_lock = SPIN_LOCK_UNLOCKED, \ + .switch_lock = SPIN_LOCK_UNLOCKED, \ + .journal_info = NULL, \ +} + + + +#endif diff -Nru a/include/linux/ip.h b/include/linux/ip.h --- a/include/linux/ip.h Fri Oct 31 14:10:53 2003 +++ b/include/linux/ip.h Fri Oct 31 14:10:53 2003 @@ -83,6 +83,7 @@ #include <linux/types.h> #include <net/sock.h> #include <linux/igmp.h> +#include <net/flow.h> struct ip_options { __u32 faddr; /* Saved first hop address */ @@ -141,6 +142,7 @@ struct rtable *rt; int length; /* Total length of all frames */ u32 addr; + struct flowi fl; } cork; }; diff -Nru a/include/linux/ipv6.h b/include/linux/ipv6.h --- a/include/linux/ipv6.h Fri Oct 31 14:10:54 2003 +++ b/include/linux/ipv6.h Fri Oct 31 14:10:54 2003 @@ -234,7 +234,6 @@ struct { struct ipv6_txoptions *opt; struct rt6_info *rt; - struct flowi *fl; int hop_limit; } cork; }; diff -Nru a/include/linux/jbd.h b/include/linux/jbd.h --- a/include/linux/jbd.h Fri Oct 31 14:10:54 2003 +++ b/include/linux/jbd.h Fri Oct 31 14:10:54 2003 @@ -384,6 +384,14 @@ struct handle_s { + /* Which journal this handle belongs to. This has to be first + * field, because current->fs_context points here. */ + journal_t * h_journal; + + /* Previous file system context. NULL if we are top-most + * call. 
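These two new fields are the core of the fs_context change: the journal pointer must stay the first member so that current->fs_context (the renamed journal_info) identifies the owning journal no matter which filesystem installed it, and h_parent lets contexts nest across recursive filesystem entry (e.g. quota writes). A sketch of the push/pop discipline this layout implies; the helper names and the exact declared type of fs_context are assumptions, only the field layout comes from this patch:

#include <linux/sched.h>
#include <linux/jbd.h>

/* Hypothetical helpers: a new handle saves the previous context and
 * installs itself; finishing the handle restores the parent. The casts
 * assume fs_context is declared as a struct fs_activation pointer. */
static void fs_context_push(handle_t *handle)
{
	handle->h_parent = (struct fs_activation *) current->fs_context;
	current->fs_context = (struct fs_activation *) handle;
}

static void fs_context_pop(handle_t *handle)
{
	current->fs_context = handle->h_parent;
}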
*/ + struct fs_activation * h_parent; + /* Which compound transaction is this update a part of? */ transaction_t *h_transaction; @@ -876,7 +884,7 @@ static inline handle_t *journal_current_handle(void) { - return current->journal_info; + return (handle_t*) current->fs_context; } /* The journaling code user interface: diff -Nru a/include/linux/jbd.h~fs_activation.diff b/include/linux/jbd.h~fs_activation.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/linux/jbd.h~fs_activation.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,1118 @@ +/* + * linux/include/linux/jbd.h + * + * Written by Stephen C. Tweedie <sct@redhat.com> + * + * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Definitions for transaction data structures for the buffer cache + * filesystem journaling support. + */ + +#ifndef _LINUX_JBD_H +#define _LINUX_JBD_H + +#if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || !defined(__KERNEL__) + +/* Allow this file to be included directly into e2fsprogs */ +#ifndef __KERNEL__ +#include "jfs_compat.h" +#define JFS_DEBUG +#define jfs_debug jbd_debug +#else + +#include <linux/buffer_head.h> +#include <linux/journal-head.h> +#include <linux/stddef.h> +#include <asm/semaphore.h> +#endif + +#define journal_oom_retry 1 + +/* + * Define JBD_PARANIOD_IOFAIL to cause a kernel BUG() if ext3 finds + * certain classes of error which can occur due to failed IOs. Under + * normal use we want ext3 to continue after such errors, because + * hardware _can_ fail, but for debugging purposes when running tests on + * known-good hardware we may want to trap these errors. + */ +#undef JBD_PARANOID_IOFAIL + +#ifdef CONFIG_JBD_DEBUG +/* + * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal + * consistency checks. By default we don't do this unless + * CONFIG_JBD_DEBUG is on. + */ +#define JBD_EXPENSIVE_CHECKING +extern int journal_enable_debug; + +#define jbd_debug(n, f, a...) \ + do { \ + if ((n) <= journal_enable_debug) { \ + printk (KERN_DEBUG "(%s, %d): %s: ", \ + __FILE__, __LINE__, __FUNCTION__); \ + printk (f, ## a); \ + } \ + } while (0) +#else +#define jbd_debug(f, a...) /**/ +#endif + +extern void * __jbd_kmalloc (const char *where, size_t size, int flags, int retry); +#define jbd_kmalloc(size, flags) \ + __jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry) +#define jbd_rep_kmalloc(size, flags) \ + __jbd_kmalloc(__FUNCTION__, (size), (flags), 1) + +#define JFS_MIN_JOURNAL_BLOCKS 1024 + +#ifdef __KERNEL__ + +/** + * typedef handle_t - The handle_t type represents a single atomic update being performed by some process. + * + * All filesystem modifications made by the process go + * through this handle. Recursive operations (such as quota operations) + * are gathered into a single update. + * + * The buffer credits field is used to account for journaled buffers + * being modified by the running process. To ensure that there is + * enough log space for all outstanding operations, we need to limit the + * number of outstanding buffers possible at any time. When the + * operation completes, any buffer credits not used are credited back to + * the transaction, so that at all times we know how many buffers the + * outstanding updates on a transaction might possibly touch. + * + * This is an opaque datatype. 
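The buffer-credit accounting described above is easiest to see in a complete handle lifecycle. A trimmed sketch using the public jbd entry points (journal_start(), journal_get_write_access(), journal_dirty_metadata(), journal_stop()); real callers handle errors more carefully:

#include <linux/err.h>
#include <linux/jbd.h>

/* Reserve credit for one metadata buffer, modify it under the handle,
 * and release the handle; any unused credits flow back to the
 * transaction when the handle is stopped. */
static int example_metadata_update(journal_t *journal, struct buffer_head *bh)
{
	handle_t *handle = journal_start(journal, 1);
	int err;

	if (IS_ERR(handle))
		return PTR_ERR(handle);
	err = journal_get_write_access(handle, bh);
	if (!err) {
		/* ... modify bh->b_data here ... */
		err = journal_dirty_metadata(handle, bh);
	}
	journal_stop(handle);
	return err;
}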
+ **/ +typedef struct handle_s handle_t; /* Atomic operation type */ + + +/** + * typedef journal_t - The journal_t maintains all of the journaling state information for a single filesystem. + * + * journal_t is linked to from the fs superblock structure. + * + * We use the journal_t to keep track of all outstanding transaction + * activity on the filesystem, and to manage the state of the log + * writing process. + * + * This is an opaque datatype. + **/ +typedef struct journal_s journal_t; /* Journal control structure */ +#endif + +/* + * Internal structures used by the logging mechanism: + */ + +#define JFS_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */ + +/* + * On-disk structures + */ + +/* + * Descriptor block types: + */ + +#define JFS_DESCRIPTOR_BLOCK 1 +#define JFS_COMMIT_BLOCK 2 +#define JFS_SUPERBLOCK_V1 3 +#define JFS_SUPERBLOCK_V2 4 +#define JFS_REVOKE_BLOCK 5 + +/* + * Standard header for all descriptor blocks: + */ +typedef struct journal_header_s +{ + __u32 h_magic; + __u32 h_blocktype; + __u32 h_sequence; +} journal_header_t; + + +/* + * The block tag: used to describe a single buffer in the journal + */ +typedef struct journal_block_tag_s +{ + __u32 t_blocknr; /* The on-disk block number */ + __u32 t_flags; /* See below */ +} journal_block_tag_t; + +/* + * The revoke descriptor: used on disk to describe a series of blocks to + * be revoked from the log + */ +typedef struct journal_revoke_header_s +{ + journal_header_t r_header; + int r_count; /* Count of bytes used in the block */ +} journal_revoke_header_t; + + +/* Definitions for the journal tag flags word: */ +#define JFS_FLAG_ESCAPE 1 /* on-disk block is escaped */ +#define JFS_FLAG_SAME_UUID 2 /* block has same uuid as previous */ +#define JFS_FLAG_DELETED 4 /* block deleted by this transaction */ +#define JFS_FLAG_LAST_TAG 8 /* last tag in this descriptor block */ + + +/* + * The journal superblock. All fields are in big-endian byte order. + */ +typedef struct journal_superblock_s +{ +/* 0x0000 */ + journal_header_t s_header; + +/* 0x000C */ + /* Static information describing the journal */ + __u32 s_blocksize; /* journal device blocksize */ + __u32 s_maxlen; /* total blocks in journal file */ + __u32 s_first; /* first block of log information */ + +/* 0x0018 */ + /* Dynamic information describing the current state of the log */ + __u32 s_sequence; /* first commit ID expected in log */ + __u32 s_start; /* blocknr of start of log */ + +/* 0x0020 */ + /* Error value, as set by journal_abort(). */ + __s32 s_errno; + +/* 0x0024 */ + /* Remaining fields are only valid in a version-2 superblock */ + __u32 s_feature_compat; /* compatible feature set */ + __u32 s_feature_incompat; /* incompatible feature set */ + __u32 s_feature_ro_compat; /* readonly-compatible feature set */ +/* 0x0030 */ + __u8 s_uuid[16]; /* 128-bit uuid for journal */ + +/* 0x0040 */ + __u32 s_nr_users; /* Nr of filesystems sharing log */ + + __u32 s_dynsuper; /* Blocknr of dynamic superblock copy*/ + +/* 0x0048 */ + __u32 s_max_transaction; /* Limit of journal blocks per trans.*/ + __u32 s_max_trans_data; /* Limit of data blocks per trans. 
*/ + +/* 0x0050 */ + __u32 s_padding[44]; + +/* 0x0100 */ + __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ +/* 0x0400 */ +} journal_superblock_t; + +#define JFS_HAS_COMPAT_FEATURE(j,mask) \ + ((j)->j_format_version >= 2 && \ + ((j)->j_superblock->s_feature_compat & cpu_to_be32((mask)))) +#define JFS_HAS_RO_COMPAT_FEATURE(j,mask) \ + ((j)->j_format_version >= 2 && \ + ((j)->j_superblock->s_feature_ro_compat & cpu_to_be32((mask)))) +#define JFS_HAS_INCOMPAT_FEATURE(j,mask) \ + ((j)->j_format_version >= 2 && \ + ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) + +#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001 + +/* Features known to this kernel version: */ +#define JFS_KNOWN_COMPAT_FEATURES 0 +#define JFS_KNOWN_ROCOMPAT_FEATURES 0 +#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE + +#ifdef __KERNEL__ + +#include <linux/fs.h> +#include <linux/sched.h> +#include <asm/bug.h> + +#define JBD_ASSERTIONS +#ifdef JBD_ASSERTIONS +#define J_ASSERT(assert) \ +do { \ + if (!(assert)) { \ + printk (KERN_EMERG \ + "Assertion failure in %s() at %s:%d: \"%s\"\n", \ + __FUNCTION__, __FILE__, __LINE__, # assert); \ + BUG(); \ + } \ +} while (0) + +#if defined(CONFIG_BUFFER_DEBUG) +void buffer_assertion_failure(struct buffer_head *bh); +#define J_ASSERT_BH(bh, expr) \ + do { \ + if (!(expr)) \ + buffer_assertion_failure(bh); \ + J_ASSERT(expr); \ + } while (0) +#define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr) +#else +#define J_ASSERT_BH(bh, expr) J_ASSERT(expr) +#define J_ASSERT_JH(jh, expr) J_ASSERT(expr) +#endif + +#else +#define J_ASSERT(assert) do { } while (0) +#endif /* JBD_ASSERTIONS */ + +#if defined(JBD_PARANOID_IOFAIL) +#define J_EXPECT(expr, why...) J_ASSERT(expr) +#define J_EXPECT_BH(bh, expr, why...) J_ASSERT_BH(bh, expr) +#define J_EXPECT_JH(jh, expr, why...) J_ASSERT_JH(jh, expr) +#else +#define __journal_expect(expr, why...) \ + do { \ + if (!(expr)) { \ + printk(KERN_ERR \ + "EXT3-fs unexpected failure: %s;\n",# expr); \ + printk(KERN_ERR why); \ + } \ + } while (0) +#define J_EXPECT(expr, why...) __journal_expect(expr, ## why) +#define J_EXPECT_BH(bh, expr, why...) __journal_expect(expr, ## why) +#define J_EXPECT_JH(jh, expr, why...) 
__journal_expect(expr, ## why) +#endif + +enum jbd_state_bits { + BH_JBD /* Has an attached ext3 journal_head */ + = BH_PrivateStart, + BH_JWrite, /* Being written to log (@@@ DEBUGGING) */ + BH_Freed, /* Has been freed (truncated) */ + BH_Revoked, /* Has been revoked from the log */ + BH_RevokeValid, /* Revoked flag is valid */ + BH_JBDDirty, /* Is dirty but journaled */ + BH_State, /* Pins most journal_head state */ + BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ +}; + +BUFFER_FNS(JBD, jbd) +BUFFER_FNS(JWrite, jwrite) +BUFFER_FNS(JBDDirty, jbddirty) +TAS_BUFFER_FNS(JBDDirty, jbddirty) +BUFFER_FNS(Freed, freed) + +static inline struct buffer_head *jh2bh(struct journal_head *jh) +{ + return jh->b_bh; +} + +static inline struct journal_head *bh2jh(struct buffer_head *bh) +{ + return bh->b_private; +} + +static inline void jbd_lock_bh_state(struct buffer_head *bh) +{ + bit_spin_lock(BH_State, &bh->b_state); +} + +static inline int jbd_trylock_bh_state(struct buffer_head *bh) +{ + return bit_spin_trylock(BH_State, &bh->b_state); +} + +static inline int jbd_is_locked_bh_state(struct buffer_head *bh) +{ + return bit_spin_is_locked(BH_State, &bh->b_state); +} + +static inline void jbd_unlock_bh_state(struct buffer_head *bh) +{ + bit_spin_unlock(BH_State, &bh->b_state); +} + +static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) +{ + bit_spin_lock(BH_JournalHead, &bh->b_state); +} + +static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) +{ + bit_spin_unlock(BH_JournalHead, &bh->b_state); +} + +#define HAVE_JOURNAL_CALLBACK_STATUS +/** + * struct journal_callback - Base structure for callback information. + * @jcb_list: list information for other callbacks attached to the same handle. + * @jcb_func: Function to call with this callback structure. + * + * This struct is a 'seed' structure for a using with your own callback + * structs. If you are using callbacks you must allocate one of these + * or another struct of your own definition which has this struct + * as it's first element and pass it to journal_callback_set(). + * + * This is used internally by jbd to maintain callback information. + * + * See journal_callback_set for more information. + **/ +struct journal_callback { + struct list_head jcb_list; /* t_jcb_lock */ + void (*jcb_func)(struct journal_callback *jcb, int error); + /* user data goes here */ +}; + +struct jbd_revoke_table_s; + +/** + * struct handle_s - The handle_s type is the concrete type associated with + * handle_t. + * @h_transaction: Which compound transaction is this update a part of? + * @h_buffer_credits: Number of remaining buffers we are allowed to dirty. + * @h_ref: Reference count on this handle + * @h_jcb: List of application registered callbacks for this handle. + * @h_err: Field for caller's use to track errors through large fs operations + * @h_sync: flag for sync-on-close + * @h_jdata: flag to force data journaling + * @h_aborted: flag indicating fatal error on handle + **/ + +/* Docbook can't yet cope with the bit fields, but will leave the documentation + * in so it can be fixed later. + */ + +struct handle_s +{ + /* Which compound transaction is this update a part of? */ + transaction_t *h_transaction; + + /* Number of remaining buffers we are allowed to dirty: */ + int h_buffer_credits; + + /* Reference count on this handle */ + int h_ref; + + /* Field for caller's use to track errors through large fs */ + /* operations */ + int h_err; + + /* + * List of application registered callbacks for this handle. 
The + * function(s) will be called after the transaction that this handle is + * part of has been committed to disk. [t_jcb_lock] + */ + struct list_head h_jcb; + + /* Flags [no locking] */ + unsigned int h_sync: 1; /* sync-on-close */ + unsigned int h_jdata: 1; /* force data journaling */ + unsigned int h_aborted: 1; /* fatal error on handle */ +}; + + +/* The transaction_t type is the guts of the journaling mechanism. It + * tracks a compound transaction through its various states: + * + * RUNNING: accepting new updates + * LOCKED: Updates still running but we don't accept new ones + * RUNDOWN: Updates are tidying up but have finished requesting + * new buffers to modify (state not used for now) + * FLUSH: All updates complete, but we are still writing to disk + * COMMIT: All data on disk, writing commit record + * FINISHED: We still have to keep the transaction for checkpointing. + * + * The transaction keeps track of all of the buffers modified by a + * running transaction, and all of the buffers committed but not yet + * flushed to home for finished transactions. + */ + +/* + * Lock ranking: + * + * j_list_lock + * ->jbd_lock_bh_journal_head() (This is "innermost") + * + * j_state_lock + * ->jbd_lock_bh_state() + * + * jbd_lock_bh_state() + * ->j_list_lock + * + * j_state_lock + * ->t_handle_lock + * + * j_state_lock + * ->j_list_lock (journal_unmap_buffer) + * + * t_handle_lock + * ->t_jcb_lock + */ + +struct transaction_s +{ + /* Pointer to the journal for this transaction. [no locking] */ + journal_t *t_journal; + + /* Sequence number for this transaction [no locking] */ + tid_t t_tid; + + /* + * Transaction's current state + * [no locking - only kjournald alters this] + * FIXME: needs barriers + * KLUDGE: [use j_state_lock] + */ + enum { + T_RUNNING, + T_LOCKED, + T_RUNDOWN, + T_FLUSH, + T_COMMIT, + T_FINISHED + } t_state; + + /* + * Where in the log does this transaction's commit start? [no locking] + */ + unsigned long t_log_start; + + /* Number of buffers on the t_buffers list [j_list_lock] */ + int t_nr_buffers; + + /* + * Doubly-linked circular list of all buffers reserved but not yet + * modified by this transaction [j_list_lock] + */ + struct journal_head *t_reserved_list; + + /* + * Doubly-linked circular list of all metadata buffers owned by this + * transaction [j_list_lock] + */ + struct journal_head *t_buffers; + + /* + * Doubly-linked circular list of all data buffers still to be + * flushed before this transaction can be committed [j_list_lock] + */ + struct journal_head *t_sync_datalist; + + /* + * Doubly-linked circular list of all forget buffers (superseded + * buffers which we can un-checkpoint once this transaction commits) + * [j_list_lock] + */ + struct journal_head *t_forget; + + /* + * Doubly-linked circular list of all buffers still to be flushed before + * this transaction can be checkpointed. [j_list_lock] + */ + struct journal_head *t_checkpoint_list; + + /* + * Doubly-linked circular list of temporary buffers currently undergoing + * IO in the log [j_list_lock] + */ + struct journal_head *t_iobuf_list; + + /* + * Doubly-linked circular list of metadata buffers being shadowed by log + * IO. The IO buffers on the iobuf list and the shadow buffers on this + * list match each other one for one at all times. [j_list_lock] + */ + struct journal_head *t_shadow_list; + + /* + * Doubly-linked circular list of control buffers being written to the + * log. 
[j_list_lock] + */ + struct journal_head *t_log_list; + + /* + * Protects info related to handles + */ + spinlock_t t_handle_lock; + + /* + * Number of outstanding updates running on this transaction + * [t_handle_lock] + */ + int t_updates; + + /* + * Number of buffers reserved for use by all handles in this transaction + * handle but not yet modified. [t_handle_lock] + */ + int t_outstanding_credits; + + /* + * Forward and backward links for the circular list of all transactions + * awaiting checkpoint. [j_list_lock] + */ + transaction_t *t_cpnext, *t_cpprev; + + /* + * When will the transaction expire (become due for commit), in jiffies? + * [no locking] + */ + unsigned long t_expires; + + /* + * How many handles used this transaction? [t_handle_lock] + */ + int t_handle_count; + + /* + * Protects the callback list + */ + spinlock_t t_jcb_lock; + /* + * List of registered callback functions for this transaction. + * Called when the transaction is committed. [t_jcb_lock] + */ + struct list_head t_jcb; +}; + +/** + * struct journal_s - The journal_s type is the concrete type associated with + * journal_t. + * @j_flags: General journaling state flags + * @j_errno: Is there an outstanding uncleared error on the journal (from a + * prior abort)? + * @j_sb_buffer: First part of superblock buffer + * @j_superblock: Second part of superblock buffer + * @j_format_version: Version of the superblock format + * @j_barrier_count: Number of processes waiting to create a barrier lock + * @j_barrier: The barrier lock itself + * @j_running_transaction: The current running transaction.. + * @j_committing_transaction: the transaction we are pushing to disk + * @j_checkpoint_transactions: a linked circular list of all transactions + * waiting for checkpointing + * @j_wait_transaction_locked: Wait queue for waiting for a locked transaction + * to start committing, or for a barrier lock to be released + * @j_wait_logspace: Wait queue for waiting for checkpointing to complete + * @j_wait_done_commit: Wait queue for waiting for commit to complete + * @j_wait_checkpoint: Wait queue to trigger checkpointing + * @j_wait_commit: Wait queue to trigger commit + * @j_wait_updates: Wait queue to wait for updates to complete + * @j_checkpoint_sem: Semaphore for locking against concurrent checkpoints + * @j_head: Journal head - identifies the first unused block in the journal + * @j_tail: Journal tail - identifies the oldest still-used block in the + * journal. + * @j_free: Journal free - how many free blocks are there in the journal? + * @j_first: The block number of the first usable block + * @j_last: The block number one beyond the last usable block + * @j_dev: Device where we store the journal + * @j_blocksize: blocksize for the location where we store the journal. + * @j_blk_offset: starting block offset for into the device where we store the + * journal + * @j_fs_dev: Device which holds the client fs. For internal journal this will + * be equal to j_dev + * @j_maxlen: Total maximum capacity of the journal region on disk. + * @j_inode: Optional inode where we store the journal. If present, all journal + * block numbers are mapped into this inode via bmap(). + * @j_tail_sequence: Sequence number of the oldest transaction in the log + * @j_transaction_sequence: Sequence number of the next transaction to grant + * @j_commit_sequence: Sequence number of the most recently committed + * transaction + * @j_commit_request: Sequence number of the most recent transaction wanting + * commit + * @j_uuid: Uuid of client object. 
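The four sequence numbers documented above are compared with wrap-safe signed arithmetic rather than directly, since tid_t eventually wraps. A sketch of the idiom (jbd's real helpers for this are tid_gt() and tid_geq()):

/* Wrap-safe "x >= y" for transaction IDs: valid as long as the two
 * tids are within 2^31 of each other. */
static inline int example_tid_geq(tid_t x, tid_t y)
{
	int difference = (int)(x - y);
	return difference >= 0;
}

/* e.g. "has tid reached disk?" is example_tid_geq(j_commit_sequence, tid) */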
+
+/**
+ * struct journal_s - The journal_s type is the concrete type associated with
+ *	journal_t.
+ * @j_flags: General journaling state flags
+ * @j_errno: Is there an outstanding uncleared error on the journal (from a
+ *	prior abort)?
+ * @j_sb_buffer: Buffer head through which the journal superblock is read and
+ *	written
+ * @j_superblock: Pointer to the superblock data within @j_sb_buffer
+ * @j_format_version: Version of the superblock format
+ * @j_barrier_count: Number of processes waiting to create a barrier lock
+ * @j_barrier: The barrier lock itself
+ * @j_running_transaction: The current running transaction.
+ * @j_committing_transaction: The transaction we are pushing to disk
+ * @j_checkpoint_transactions: A linked circular list of all transactions
+ *	waiting for checkpointing
+ * @j_wait_transaction_locked: Wait queue for waiting for a locked transaction
+ *	to start committing, or for a barrier lock to be released
+ * @j_wait_logspace: Wait queue for waiting for checkpointing to complete
+ * @j_wait_done_commit: Wait queue for waiting for commit to complete
+ * @j_wait_checkpoint: Wait queue to trigger checkpointing
+ * @j_wait_commit: Wait queue to trigger commit
+ * @j_wait_updates: Wait queue to wait for updates to complete
+ * @j_checkpoint_sem: Semaphore for locking against concurrent checkpoints
+ * @j_head: Journal head - identifies the first unused block in the journal
+ * @j_tail: Journal tail - identifies the oldest still-used block in the
+ *	journal.
+ * @j_free: Journal free - how many free blocks are there in the journal?
+ * @j_first: The block number of the first usable block
+ * @j_last: The block number one beyond the last usable block
+ * @j_dev: Device where we store the journal
+ * @j_blocksize: Block size for the location where we store the journal.
+ * @j_blk_offset: Starting block offset into the device where we store the
+ *	journal
+ * @j_fs_dev: Device which holds the client fs.  For internal journal this
+ *	will be equal to j_dev
+ * @j_maxlen: Total maximum capacity of the journal region on disk.
+ * @j_inode: Optional inode where we store the journal.  If present, all
+ *	journal block numbers are mapped into this inode via bmap().
+ * @j_tail_sequence: Sequence number of the oldest transaction in the log
+ * @j_transaction_sequence: Sequence number of the next transaction to grant
+ * @j_commit_sequence: Sequence number of the most recently committed
+ *	transaction
+ * @j_commit_request: Sequence number of the most recent transaction wanting
+ *	commit
+ * @j_uuid: UUID of the client object.
+ * @j_task: Pointer to the current commit thread for this journal
+ * @j_max_transaction_buffers: Maximum number of metadata buffers to allow in
+ *	a single compound commit transaction
+ * @j_commit_interval: What is the maximum transaction lifetime before we
+ *	begin a commit?
+ * @j_commit_timer: The timer used to wake up the commit thread
+ * @j_revoke: The revoke table - maintains the list of revoked blocks in the
+ *	current transaction.
+ */
+
+struct journal_s
+{
+	/* General journaling state flags [j_state_lock] */
+	unsigned long j_flags;
+
+	/*
+	 * Is there an outstanding uncleared error on the journal (from a prior
+	 * abort)? [j_state_lock]
+	 */
+	int j_errno;
+
+	/* The superblock buffer */
+	struct buffer_head *j_sb_buffer;
+	journal_superblock_t *j_superblock;
+
+	/* Version of the superblock format */
+	int j_format_version;
+
+	/*
+	 * Protect the various scalars in the journal
+	 */
+	spinlock_t j_state_lock;
+
+	/*
+	 * Number of processes waiting to create a barrier lock [j_state_lock]
+	 */
+	int j_barrier_count;
+
+	/* The barrier lock itself */
+	struct semaphore j_barrier;
+
+	/*
+	 * Transactions: The current running transaction...
+	 * [j_state_lock] [caller holding open handle]
+	 */
+	transaction_t *j_running_transaction;
+
+	/*
+	 * the transaction we are pushing to disk
+	 * [j_state_lock] [caller holding open handle]
+	 */
+	transaction_t *j_committing_transaction;
+
+	/*
+	 * ... and a linked circular list of all transactions waiting for
+	 * checkpointing. [j_list_lock]
+	 */
+	transaction_t *j_checkpoint_transactions;
+
+	/*
+	 * Wait queue for waiting for a locked transaction to start committing,
+	 * or for a barrier lock to be released
+	 */
+	wait_queue_head_t j_wait_transaction_locked;
+
+	/* Wait queue for waiting for checkpointing to complete */
+	wait_queue_head_t j_wait_logspace;
+
+	/* Wait queue for waiting for commit to complete */
+	wait_queue_head_t j_wait_done_commit;
+
+	/* Wait queue to trigger checkpointing */
+	wait_queue_head_t j_wait_checkpoint;
+
+	/* Wait queue to trigger commit */
+	wait_queue_head_t j_wait_commit;
+
+	/* Wait queue to wait for updates to complete */
+	wait_queue_head_t j_wait_updates;
+
+	/* Semaphore for locking against concurrent checkpoints */
+	struct semaphore j_checkpoint_sem;
+
+	/*
+	 * Journal head: identifies the first unused block in the journal.
+	 * [j_state_lock]
+	 */
+	unsigned long j_head;
+
+	/*
+	 * Journal tail: identifies the oldest still-used block in the journal.
+	 * [j_state_lock]
+	 */
+	unsigned long j_tail;
+
+	/*
+	 * Journal free: how many free blocks are there in the journal?
+	 * [j_state_lock]
+	 */
+	unsigned long j_free;
+
+	/*
+	 * Journal start and end: the block numbers of the first usable block
+	 * and one beyond the last usable block in the journal. [j_state_lock]
+	 */
+	unsigned long j_first;
+	unsigned long j_last;
+
+	/*
+	 * Device, blocksize and starting block offset for the location where we
+	 * store the journal.
+	 */
+	struct block_device *j_dev;
+	int j_blocksize;
+	unsigned int j_blk_offset;
+
+	/*
+	 * Device which holds the client fs.  For internal journal this will be
+	 * equal to j_dev.
+	 */
+	struct block_device *j_fs_dev;
+
+	/* Total maximum capacity of the journal region on disk. */
+	unsigned int j_maxlen;
+
+	/*
+	 * Protects the buffer lists and internal buffer state.
+	 */
+	spinlock_t j_list_lock;
+
+	/*
+	 * Optional inode where we store the journal.  If present, all
+	 * journal block numbers are mapped into this inode via bmap().
+	 */
+	struct inode *j_inode;
+
+	/*
+	 * Sequence number of the oldest transaction in the log [j_state_lock]
+	 */
+	tid_t j_tail_sequence;
+
+	/*
+	 * Sequence number of the next transaction to grant [j_state_lock]
+	 */
+	tid_t j_transaction_sequence;
+
+	/*
+	 * Sequence number of the most recently committed transaction
+	 * [j_state_lock].
+	 */
+	tid_t j_commit_sequence;
+
+	/*
+	 * Sequence number of the most recent transaction wanting commit
+	 * [j_state_lock]
+	 */
+	tid_t j_commit_request;
+
+	/*
+	 * Journal uuid: identifies the object (filesystem, LVM volume etc)
+	 * backed by this journal.  This will eventually be replaced by an array
+	 * of uuids, allowing us to index multiple devices within a single
+	 * journal and to perform atomic updates across them.
+	 */
+	__u8 j_uuid[16];
+
+	/* Pointer to the current commit thread for this journal */
+	struct task_struct *j_task;
+
+	/*
+	 * Maximum number of metadata buffers to allow in a single compound
+	 * commit transaction
+	 */
+	int j_max_transaction_buffers;
+
+	/*
+	 * What is the maximum transaction lifetime before we begin a commit?
+	 */
+	unsigned long j_commit_interval;
+
+	/* The timer used to wake up the commit thread */
+	struct timer_list *j_commit_timer;
+
+	/*
+	 * The revoke table: maintains the list of revoked blocks in the
+	 * current transaction. [j_revoke_lock]
+	 */
+	spinlock_t j_revoke_lock;
+	struct jbd_revoke_table_s *j_revoke;
+	struct jbd_revoke_table_s *j_revoke_table[2];
+
+	/*
+	 * An opaque pointer to fs-private information.  ext3 puts its
+	 * superblock pointer here
+	 */
+	void *j_private;
+};
+
+/*
+ * Journal flag definitions
+ */
+#define JFS_UNMOUNT	0x001	/* Journal thread is being destroyed */
+#define JFS_ABORT	0x002	/* Journaling has been aborted for errors. */
+#define JFS_ACK_ERR	0x004	/* The errno in the sb has been acked */
+#define JFS_FLUSHED	0x008	/* The journal superblock has been flushed */
+#define JFS_LOADED	0x010	/* The journal superblock has been loaded */
+
+/*
+ * Function declarations for the journaling transaction and buffer
+ * management
+ */
+
+/* Filing buffers */
+extern void journal_unfile_buffer(journal_t *, struct journal_head *);
+extern void __journal_unfile_buffer(struct journal_head *);
+extern void __journal_refile_buffer(struct journal_head *);
+extern void journal_refile_buffer(journal_t *, struct journal_head *);
+extern void __journal_file_buffer(struct journal_head *, transaction_t *, int);
+extern void __journal_free_buffer(struct journal_head *bh);
+extern void journal_file_buffer(struct journal_head *, transaction_t *, int);
+extern void __journal_clean_data_list(transaction_t *transaction);
+
+/* Log buffer allocation */
+extern struct journal_head * journal_get_descriptor_buffer(journal_t *);
+int journal_next_log_block(journal_t *, unsigned long *);
+
+/* Commit management */
+extern void journal_commit_transaction(journal_t *);
+
+/* Checkpoint list management */
+int __journal_clean_checkpoint_list(journal_t *journal);
+void __journal_remove_checkpoint(struct journal_head *);
+void __journal_insert_checkpoint(struct journal_head *, transaction_t *);
+
+/* Buffer IO */
+extern int
+journal_write_metadata_buffer(transaction_t *transaction,
+			      struct journal_head *jh_in,
+			      struct journal_head **jh_out,
+			      int blocknr);
+
+/* Transaction locking */
+extern void __wait_on_journal (journal_t *);
+
+/*
+ * Journal locking.
+ *
+ * We need to lock the journal during transaction state changes so that nobody
+ * ever tries to take a handle on the running transaction while we are in the
+ * middle of moving it to the commit phase.  j_state_lock does this.
+ *
+ * Note that the locking is completely interrupt unsafe.  We never touch
+ * journal structures from interrupts.
+ */
+
+static inline handle_t *journal_current_handle(void)
+{
+	return current->journal_info;
+}
+
+/* The journaling code user interface:
+ *
+ * Create and destroy handles
+ * Register buffer modifications against the current transaction.
+ */
+
+extern handle_t *journal_start(journal_t *, int nblocks);
+extern int	 journal_restart (handle_t *, int nblocks);
+extern int	 journal_extend (handle_t *, int nblocks);
+extern int	 journal_get_write_access(handle_t *, struct buffer_head *,
+					  int *credits);
+extern int	 journal_get_create_access (handle_t *, struct buffer_head *);
+extern int	 journal_get_undo_access(handle_t *, struct buffer_head *,
+					 int *credits);
+extern int	 journal_dirty_data (handle_t *, struct buffer_head *);
+extern int	 journal_dirty_metadata (handle_t *, struct buffer_head *);
+extern void	 journal_release_buffer (handle_t *, struct buffer_head *,
+					 int credits);
+extern void	 journal_forget (handle_t *, struct buffer_head *);
+extern void	 journal_sync_buffer (struct buffer_head *);
+extern int	 journal_invalidatepage(journal_t *,
+					struct page *, unsigned long);
+extern int	 journal_try_to_free_buffers(journal_t *, struct page *, int);
+extern int	 journal_stop(handle_t *);
+extern int	 journal_flush (journal_t *);
+extern void	 journal_callback_set(handle_t *handle,
+				      void (*fn)(struct journal_callback *, int),
+				      struct journal_callback *jcb);
+
+extern void	 journal_lock_updates (journal_t *);
+extern void	 journal_unlock_updates (journal_t *);
+
+extern journal_t * journal_init_dev(struct block_device *bdev,
+				    struct block_device *fs_dev,
+				    int start, int len, int bsize);
+extern journal_t * journal_init_inode (struct inode *);
+extern int	   journal_update_format (journal_t *);
+extern int	   journal_check_used_features
+		   (journal_t *, unsigned long, unsigned long, unsigned long);
+extern int	   journal_check_available_features
+		   (journal_t *, unsigned long, unsigned long, unsigned long);
+extern int	   journal_set_features
+		   (journal_t *, unsigned long, unsigned long, unsigned long);
+extern int	   journal_create (journal_t *);
+extern int	   journal_load (journal_t *journal);
+extern void	   journal_destroy (journal_t *);
+extern int	   journal_recover (journal_t *journal);
+extern int	   journal_wipe (journal_t *, int);
+extern int	   journal_skip_recovery (journal_t *);
+extern void	   journal_update_superblock (journal_t *, int);
+extern void	   __journal_abort_hard (journal_t *);
+extern void	   __journal_abort_soft (journal_t *, int);
+extern void	   journal_abort (journal_t *, int);
+extern int	   journal_errno (journal_t *);
+extern void	   journal_ack_err (journal_t *);
+extern int	   journal_clear_err (journal_t *);
+extern int	   journal_bmap(journal_t *, unsigned long, unsigned long *);
+extern int	   journal_force_commit(journal_t *);
+
+/*
+ * journal_head management
+ */
+struct journal_head *journal_add_journal_head(struct buffer_head *bh);
+struct journal_head *journal_grab_journal_head(struct buffer_head *bh);
+void journal_remove_journal_head(struct buffer_head *bh);
+void journal_put_journal_head(struct journal_head *jh);
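+
+/*
+ * A minimal usage sketch (illustrative): pin a buffer's journal_head
+ * while inspecting it, then drop the reference.  journal_grab_journal_head()
+ * is assumed to return NULL when the buffer has no journal_head attached.
+ *
+ *	struct journal_head *jh = journal_grab_journal_head(bh);
+ *
+ *	if (jh) {
+ *		... examine jh here ...
+ *		journal_put_journal_head(jh);
+ *	}
+ */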
+
+/*
+ * handle management
+ */
+extern kmem_cache_t *jbd_handle_cache;
+
+static inline handle_t *jbd_alloc_handle(int gfp_flags)
+{
+	return kmem_cache_alloc(jbd_handle_cache, gfp_flags);
+}
+
+static inline void jbd_free_handle(handle_t *handle)
+{
+	kmem_cache_free(jbd_handle_cache, handle);
+}
+
+/* Primary revoke support */
+#define JOURNAL_REVOKE_DEFAULT_HASH 256
+extern int	   journal_init_revoke(journal_t *, int);
+extern void	   journal_destroy_revoke_caches(void);
+extern int	   journal_init_revoke_caches(void);
+
+extern void	   journal_destroy_revoke(journal_t *);
+extern int	   journal_revoke (handle_t *,
+				   unsigned long, struct buffer_head *);
+extern int	   journal_cancel_revoke(handle_t *, struct journal_head *);
+extern void	   journal_write_revoke_records(journal_t *, transaction_t *);
+
+/* Recovery revoke support */
+extern int	journal_set_revoke(journal_t *, unsigned long, tid_t);
+extern int	journal_test_revoke(journal_t *, unsigned long, tid_t);
+extern void	journal_clear_revoke(journal_t *);
+extern void	journal_brelse_array(struct buffer_head *b[], int n);
+extern void	journal_switch_revoke_table(journal_t *journal);
+
+/*
+ * The log thread user interface:
+ *
+ * Request space in the current transaction, and force transaction commit
+ * transitions on demand.
+ */
+
+int __log_space_left(journal_t *); /* Called with journal locked */
+int log_start_commit(journal_t *journal, tid_t tid);
+int __log_start_commit(journal_t *journal, tid_t tid);
+int journal_start_commit(journal_t *journal, tid_t *tid);
+int log_wait_commit(journal_t *journal, tid_t tid);
+int log_do_checkpoint(journal_t *journal);
+
+void __log_wait_for_space(journal_t *journal);
+extern void	__journal_drop_transaction(journal_t *, transaction_t *);
+extern int	cleanup_journal_tail(journal_t *);
+
+/* Debugging code only: */
+
+#define jbd_ENOSYS() \
+do { \
+	printk (KERN_ERR "JBD unimplemented function " __FUNCTION__); \
+	current->state = TASK_UNINTERRUPTIBLE; \
+	schedule(); \
+} while (1)
+
+/*
+ * is_journal_aborted
+ *
+ * Simple test wrapper function to test the JFS_ABORT state flag.  This
+ * bit, when set, indicates that we have had a fatal error somewhere,
+ * either inside the journaling layer or indicated to us by the client
+ * (e.g. ext3), and that we should not commit any further transactions.
+ */
+
+static inline int is_journal_aborted(journal_t *journal)
+{
+	return journal->j_flags & JFS_ABORT;
+}
+
+static inline int is_handle_aborted(handle_t *handle)
+{
+	if (handle->h_aborted)
+		return 1;
+	return is_journal_aborted(handle->h_transaction->t_journal);
+}
+
+static inline void journal_abort_handle(handle_t *handle)
+{
+	handle->h_aborted = 1;
+}
+
+#endif /* __KERNEL__ */
+
+/* Comparison functions for transaction IDs: perform comparisons using
+ * modulo arithmetic so that they work over sequence number wraps. */
+
+static inline int tid_gt(tid_t x, tid_t y)
+{
+	int difference = (x - y);
+	return (difference > 0);
+}
+
+static inline int tid_geq(tid_t x, tid_t y)
+{
+	int difference = (x - y);
+	return (difference >= 0);
+}
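+
+/*
+ * Worked example (illustrative): with 32-bit tids, tid_gt(0x00000001,
+ * 0xfffffffe) evaluates the unsigned difference 0x00000003, which is
+ * positive when interpreted as an int, so tid 1 correctly compares as
+ * "after" 0xfffffffe even though the sequence counter has wrapped.
+ */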
+
+extern int journal_blocks_per_page(struct inode *inode);
+
+/*
+ * Return the minimum number of blocks which must be free in the journal
+ * before a new transaction may be started.  Must be called under
+ * j_state_lock.
+ */
+static inline int jbd_space_needed(journal_t *journal)
+{
+	int nblocks = journal->j_max_transaction_buffers;
+	if (journal->j_committing_transaction)
+		nblocks += journal->j_committing_transaction->
+					t_outstanding_credits;
+	return nblocks;
+}
+
+/*
+ * Definitions which augment the buffer_head layer
+ */
+
+/* journaling buffer types */
+#define BJ_None		0	/* Not journaled */
+#define BJ_SyncData	1	/* Normal data: flush before commit */
+#define BJ_Metadata	2	/* Normal journaled metadata */
+#define BJ_Forget	3	/* Buffer superseded by this transaction */
+#define BJ_IO		4	/* Buffer is for temporary IO use */
+#define BJ_Shadow	5	/* Buffer contents being shadowed to the log */
+#define BJ_LogCtl	6	/* Buffer contains log descriptors */
+#define BJ_Reserved	7	/* Buffer is reserved for access by journal */
+#define BJ_Types	8
+
+extern int jbd_blocks_per_page(struct inode *inode);
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_SMP
+#define assert_spin_locked(lock)	J_ASSERT(spin_is_locked(lock))
+#else
+#define assert_spin_locked(lock)	do {} while (0)
+#endif
+
+#define buffer_trace_init(bh)	do {} while (0)
+#define print_buffer_fields(bh)	do {} while (0)
+#define print_buffer_trace(bh)	do {} while (0)
+#define BUFFER_TRACE(bh, info)	do {} while (0)
+#define BUFFER_TRACE2(bh, bh2, info)	do {} while (0)
+#define JBUFFER_TRACE(jh, info)	do {} while (0)
+
+#endif /* __KERNEL__ */
+
+#endif /* CONFIG_JBD || CONFIG_JBD_MODULE || !__KERNEL__ */
+
+/*
+ * Compatibility no-ops which allow the kernel to compile without CONFIG_JBD
+ * go here.
+ */
+
+#if defined(__KERNEL__) && !(defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE))
+
+#define J_ASSERT(expr)			do {} while (0)
+#define J_ASSERT_BH(bh, expr)		do {} while (0)
+#define buffer_jbd(bh)			0
+#define journal_buffer_journal_lru(bh)	0
+
+#endif /* defined(__KERNEL__) && !defined(CONFIG_JBD) */
+#endif /* _LINUX_JBD_H */
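
A minimal sketch of how the handle API above is typically driven by a
filesystem.  It assumes an existing journal_t *journal and a struct
buffer_head *bh supplied by the caller, assumes journal_start() reports
failure via ERR_PTR(), and elides error handling; the credits argument
of journal_get_write_access() is taken to accept NULL when no credit
accounting is wanted.

	handle_t *handle = journal_start(journal, 1);	/* reserve 1 buffer */

	if (!IS_ERR(handle)) {
		journal_get_write_access(handle, bh, NULL);
		/* ... modify bh's data here ... */
		journal_dirty_metadata(handle, bh);	/* file as metadata */
		journal_stop(handle);			/* release the handle */
	}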
diff -Nru a/include/linux/llc.h b/include/linux/llc.h
--- a/include/linux/llc.h	Fri Oct 31 14:10:53 2003
+++ b/include/linux/llc.h	Fri Oct 31 14:10:53 2003
@@ -12,20 +12,17 @@
  *
  * See the GNU General Public License for more details.
  */
-#define __LLC_SOCK_SIZE__ 28	/* sizeof(sockaddr_llc), word align. */
+#define __LLC_SOCK_SIZE__ 16	/* sizeof(sockaddr_llc), word align. */
 struct sockaddr_llc {
 	sa_family_t	sllc_family;	/* AF_LLC */
 	sa_family_t	sllc_arphrd;	/* ARPHRD_ETHER */
 	unsigned char	sllc_test;
 	unsigned char	sllc_xid;
 	unsigned char	sllc_ua;	/* UA data, only for SOCK_STREAM. */
-	unsigned char	sllc_dsap;
-	unsigned char	sllc_ssap;
-	unsigned char	sllc_dmac[IFHWADDRLEN];
-	unsigned char	sllc_smac[IFHWADDRLEN];
-	unsigned char	sllc_mmac[IFHWADDRLEN];
+	unsigned char	sllc_sap;
+	unsigned char	sllc_mac[IFHWADDRLEN];
 	unsigned char	__pad[__LLC_SOCK_SIZE__ - sizeof(sa_family_t) * 2 -
-			      sizeof(unsigned char) * 5 - IFHWADDRLEN * 3];
+			      sizeof(unsigned char) * 4 - IFHWADDRLEN];
 };
 
 /* sockopt definitions. */
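
With the slimmed-down layout, a caller fills in a single SAP/MAC pair
rather than separate source and destination fields.  A hedged userspace
sketch of binding an AF_LLC socket; fd (an AF_LLC socket), dev_mac (the
local interface MAC), and the SAP value 0x42 (the 802.1D spanning-tree
SAP, chosen purely as an example) are assumptions of the sketch.

	struct sockaddr_llc addr;

	memset(&addr, 0, sizeof(addr));
	addr.sllc_family = AF_LLC;
	addr.sllc_arphrd = ARPHRD_ETHER;
	addr.sllc_sap    = 0x42;		/* SAP to listen on */
	memcpy(addr.sllc_mac, dev_mac, IFHWADDRLEN);
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));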
*/ + + struct rb_node vm_rb; + + /* + * For areas with an address space and backing store, + * one of the address_space->i_mmap{,shared} lists, + * for shm areas, the list of attaches, otherwise unused. + */ + struct list_head shared; + + /* Function pointers to deal with this struct. */ + struct vm_operations_struct * vm_ops; + + /* Information about our backing store: */ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units, *not* PAGE_CACHE_SIZE */ + struct file * vm_file; /* File we map to (can be NULL). */ + void * vm_private_data; /* was vm_pte (shared mem) */ +}; + +/* + * vm_flags.. + */ +#define VM_READ 0x00000001 /* currently active flags */ +#define VM_WRITE 0x00000002 +#define VM_EXEC 0x00000004 +#define VM_SHARED 0x00000008 + +#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ +#define VM_MAYWRITE 0x00000020 +#define VM_MAYEXEC 0x00000040 +#define VM_MAYSHARE 0x00000080 + +#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define VM_GROWSUP 0x00000200 +#define VM_SHM 0x00000400 /* shared memory area, don't swap out */ +#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ + +#define VM_EXECUTABLE 0x00001000 +#define VM_LOCKED 0x00002000 +#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ + + /* Used by sys_madvise() */ +#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ +#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ + +#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ +#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ +#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ +#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ +#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#endif + +#ifdef CONFIG_STACK_GROWSUP +#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#else +#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#endif + +#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) +#define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK +#define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) +#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) +#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) + +/* + * mapping from the currently active vm_flags protection bits (the + * low four bits) to a page protection mask.. + */ +extern pgprot_t protection_map[16]; + + +/* + * These are the virtual MM functions - opening of an area, closing and + * unmapping it (needed to keep files on disk up-to-date etc), pointer + * to the functions called when a no-page or a wp-page exception occurs. + */ +struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct * area); + struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); + int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); +}; + +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; +struct mmu_gather; +struct inode; + +/* + * Each physical page in the system has a struct page associated with + * it to keep track of whatever it is we are using the page for at the + * moment. 
Note that we have no way to track which tasks are using + * a page. + * + * Try to keep the most commonly accessed fields in single cache lines + * here (16 bytes or greater). This ordering should be particularly + * beneficial on 32-bit processors. + * + * The first line is data used in page cache lookup, the second line + * is used for linear searches (eg. clock algorithm scans). + * + * TODO: make this structure smaller, it could be as small as 32 bytes. + */ +struct page { + unsigned long flags; /* atomic flags, some possibly + updated asynchronously */ + atomic_t count; /* Usage count, see below. */ + struct list_head list; /* ->mapping has some page lists. */ + struct address_space *mapping; /* The inode (or ...) we belong to. */ + unsigned long index; /* Our offset within mapping. */ + struct list_head lru; /* Pageout list, eg. active_list; + protected by zone->lru_lock !! */ + union { + struct pte_chain *chain;/* Reverse pte mapping pointer. + * protected by PG_chainlock */ + pte_addr_t direct; + } pte; + unsigned long private; /* mapping-private opaque data */ + + /* + * On machines where all RAM is mapped into kernel address space, + * we can simply calculate the virtual address. On machines with + * highmem some memory is mapped into kernel virtual memory + * dynamically, so we need a place to store that address. + * Note that this field could be 16 bits on x86 ... ;) + * + * Architectures with slow multiplication can define + * WANT_PAGE_VIRTUAL in asm/page.h + */ +#if defined(WANT_PAGE_VIRTUAL) + void *virtual; /* Kernel virtual address (NULL if + not kmapped, ie. highmem) */ +#endif /* WANT_PAGE_VIRTUAL */ +}; + +/* + * FIXME: take this include out, include page-flags.h in + * files which need it (119 of them) + */ +#include <linux/page-flags.h> + +/* + * Methods to modify the page usage count. + * + * What counts for a page usage: + * - cache mapping (page->mapping) + * - private data (page->private) + * - page mapped in a task's page tables, each mapping + * is counted separately + * + * Also, many kernel routines increase the page count before a critical + * routine so they can be sure the page doesn't go away from under them. + */ +#define put_page_testzero(p) \ + ({ \ + BUG_ON(page_count(p) == 0); \ + atomic_dec_and_test(&(p)->count); \ + }) + +#define page_count(p) atomic_read(&(p)->count) +#define set_page_count(p,v) atomic_set(&(p)->count, v) +#define __put_page(p) atomic_dec(&(p)->count) + +extern void FASTCALL(__page_cache_release(struct page *)); + +#ifdef CONFIG_HUGETLB_PAGE + +static inline void get_page(struct page *page) +{ + if (PageCompound(page)) + page = (struct page *)page->lru.next; + atomic_inc(&page->count); +} + +static inline void put_page(struct page *page) +{ + if (PageCompound(page)) { + page = (struct page *)page->lru.next; + if (put_page_testzero(page)) { + if (page->lru.prev) { /* destructor? */ + (*(void (*)(struct page *))page->lru.prev)(page); + } else { + __page_cache_release(page); + } + } + return; + } + if (!PageReserved(page) && put_page_testzero(page)) + __page_cache_release(page); +} + +#else /* CONFIG_HUGETLB_PAGE */ + +static inline void get_page(struct page *page) +{ + atomic_inc(&page->count); +} + +static inline void put_page(struct page *page) +{ + if (!PageReserved(page) && put_page_testzero(page)) + __page_cache_release(page); +} + +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Multiple processes may "see" the same page. E.g. 
for untouched + * mappings of /dev/null, all processes see the same page full of + * zeroes, and text pages of executables and shared libraries have + * only one copy in memory, at most, normally. + * + * For the non-reserved pages, page->count denotes a reference count. + * page->count == 0 means the page is free. + * page->count == 1 means the page is used for exactly one purpose + * (e.g. a private data page of one process). + * + * A page may be used for kmalloc() or anyone else who does a + * __get_free_page(). In this case the page->count is at least 1, and + * all other fields are unused but should be 0 or NULL. The + * management of this page is the responsibility of the one who uses + * it. + * + * The other pages (we may call them "process pages") are completely + * managed by the Linux memory manager: I/O, buffers, swapping etc. + * The following discussion applies only to them. + * + * A page may belong to an inode's memory mapping. In this case, + * page->mapping is the pointer to the inode, and page->index is the + * file offset of the page, in units of PAGE_CACHE_SIZE. + * + * A page contains an opaque `private' member, which belongs to the + * page's address_space. Usually, this is the address of a circular + * list of the page's disk buffers. + * + * For pages belonging to inodes, the page->count is the number of + * attaches, plus 1 if `private' contains something, plus one for + * the page cache itself. + * + * All pages belonging to an inode are in these doubly linked lists: + * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages; + * using the page->list list_head. These fields are also used for + * freelist managemet (when page->count==0). + * + * There is also a per-mapping radix tree mapping index to the page + * in memory if present. The tree is rooted at mapping->root. + * + * All process pages can do I/O: + * - inode pages may need to be read from disk, + * - inode pages which have been modified and are MAP_SHARED may need + * to be written to disk, + * - private pages which have been modified may need to be swapped out + * to swap space and (later) to be read back into memory. + */ + +/* + * The zone field is never updated after free_area_init_core() + * sets it, so none of the operations on it need to be atomic. 
+ */ +#define ZONE_SHIFT (BITS_PER_LONG - 8) + +struct zone; +extern struct zone *zone_table[]; + +static inline struct zone *page_zone(struct page *page) +{ + return zone_table[page->flags >> ZONE_SHIFT]; +} + +static inline void set_page_zone(struct page *page, unsigned long zone_num) +{ + page->flags &= ~(~0UL << ZONE_SHIFT); + page->flags |= zone_num << ZONE_SHIFT; +} + +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +extern struct page *mem_map; +#endif + +static inline void *lowmem_page_address(struct page *page) +{ + return __va(page_to_pfn(page) << PAGE_SHIFT); +} + +#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) +#define HASHED_PAGE_VIRTUAL +#endif + +#if defined(WANT_PAGE_VIRTUAL) +#define page_address(page) ((page)->virtual) +#define set_page_address(page, address) \ + do { \ + (page)->virtual = (address); \ + } while(0) +#define page_address_init() do { } while(0) +#endif + +#if defined(HASHED_PAGE_VIRTUAL) +void *page_address(struct page *page); +void set_page_address(struct page *page, void *virtual); +void page_address_init(void); +#endif + +#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) +#define page_address(page) lowmem_page_address(page) +#define set_page_address(page, address) do { } while(0) +#define page_address_init() do { } while(0) +#endif + +/* + * Return true if this page is mapped into pagetables. Subtle: test pte.direct + * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain + * is only 32-bit. + */ +static inline int page_mapped(struct page *page) +{ + return page->pte.direct != 0; +} + +/* + * Error return values for the *_nopage functions + */ +#define NOPAGE_SIGBUS (NULL) +#define NOPAGE_OOM ((struct page *) (-1)) + +/* + * Different kinds of faults, as returned by handle_mm_fault(). + * Used to decide whether a process gets delivered SIGBUS or + * just gets major/minor fault counters bumped up. 
+ */ +#define VM_FAULT_OOM (-1) +#define VM_FAULT_SIGBUS 0 +#define VM_FAULT_MINOR 1 +#define VM_FAULT_MAJOR 2 + +#define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) + +extern void show_free_areas(void); + +struct page *shmem_nopage(struct vm_area_struct * vma, + unsigned long address, int unused); +struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags); +void shmem_lock(struct file * file, int lock); +int shmem_zero_setup(struct vm_area_struct *); + +void zap_page_range(struct vm_area_struct *vma, unsigned long address, + unsigned long size); +int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, + struct vm_area_struct *start_vma, unsigned long start_addr, + unsigned long end_addr, unsigned long *nr_accounted); +void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long address, unsigned long size); +void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr); +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma); +int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, + unsigned long size, pgprot_t prot); + +extern void invalidate_mmap_range(struct address_space *mapping, + loff_t const holebegin, + loff_t const holelen); +extern int vmtruncate(struct inode * inode, loff_t offset); +extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); +extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); +extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); +extern int make_pages_present(unsigned long addr, unsigned long end); +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +extern long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock); +extern long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice); +void put_dirty_page(struct task_struct *tsk, struct page *page, + unsigned long address, pgprot_t prot); + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, + int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); + +int __set_page_dirty_buffers(struct page *page); +int __set_page_dirty_nobuffers(struct page *page); +int set_page_dirty_lock(struct page *page); + +/* + * Prototype to add a shrinker callback for ageable caches. + * + * These functions are passed a count `nr_to_scan' and a gfpmask. They should + * scan `nr_to_scan' objects, attempting to free them. + * + * The callback must the number of objects which remain in the cache. + * + * The callback will be passes nr_to_scan == 0 when the VM is querying the + * cache size, so a fastpath for that case is appropriate. + */ +typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask); + +/* + * Add an aging callback. The int is the number of 'seeks' it takes + * to recreate one of the objects that these functions age. 
+ */ + +#define DEFAULT_SEEKS 2 +struct shrinker; +extern struct shrinker *set_shrinker(int, shrinker_t); +extern void remove_shrinker(struct shrinker *shrinker); + +/* + * If the mapping doesn't provide a set_page_dirty a_op, then + * just fall through and assume that it wants buffer_heads. + * FIXME: make the method unconditional. + */ +static inline int set_page_dirty(struct page *page) +{ + if (page->mapping) { + int (*spd)(struct page *); + + spd = page->mapping->a_ops->set_page_dirty; + if (spd) + return (*spd)(page); + } + return __set_page_dirty_buffers(page); +} + +/* + * On a two-level page table, this ends up being trivial. Thus the + * inlining and the symmetry break with pte_alloc_map() that does all + * of this out-of-line. + */ +static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + if (pgd_none(*pgd)) + return __pmd_alloc(mm, pgd, address); + return pmd_offset(pgd, address); +} + +extern void free_area_init(unsigned long * zones_size); +extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, + unsigned long * zones_size, unsigned long zone_start_pfn, + unsigned long *zholes_size); +extern void memmap_init_zone(struct page *, unsigned long, int, + unsigned long, unsigned long); +extern void mem_init(void); +extern void show_mem(void); +extern void si_meminfo(struct sysinfo * val); +extern void si_meminfo_node(struct sysinfo *val, int nid); + +/* mmap.c */ +extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *); +extern void build_mmap_rb(struct mm_struct *); +extern void exit_mmap(struct mm_struct *); + +extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + +extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long pgoff); + +static inline unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, + unsigned long flag, unsigned long offset) +{ + unsigned long ret = -EINVAL; + if ((offset + PAGE_ALIGN(len)) < offset) + goto out; + if (!(offset & ~PAGE_MASK)) + ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); +out: + return ret; +} + +extern int do_munmap(struct mm_struct *, unsigned long, size_t); + +extern unsigned long do_brk(unsigned long, unsigned long); + +static inline void +__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev) +{ + prev->vm_next = vma->vm_next; + rb_erase(&vma->vm_rb, &mm->mm_rb); + if (mm->mmap_cache == vma) + mm->mmap_cache = prev; +} + +static inline int +can_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags) +{ +#ifdef CONFIG_MMU + if (!vma->vm_file && vma->vm_flags == vm_flags) + return 1; +#endif + return 0; +} + +/* filemap.c */ +extern unsigned long page_unuse(struct page *); +extern void truncate_inode_pages(struct address_space *, loff_t); + +/* generic vm_area_ops exported for stackable file systems */ +extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int); + +/* mm/page-writeback.c */ +int write_one_page(struct page *page, int wait); + +/* readahead.c */ +#define VM_MAX_READAHEAD 128 /* kbytes */ +#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ + +int do_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read); +int force_page_cache_readahead(struct address_space *mapping, struct file 
*filp, + unsigned long offset, unsigned long nr_to_read); +void page_cache_readahead(struct address_space *mapping, + struct file_ra_state *ra, + struct file *filp, + unsigned long offset); +void handle_ra_miss(struct address_space *mapping, + struct file_ra_state *ra, pgoff_t offset); +unsigned long max_sane_readahead(unsigned long nr); + +/* Do stack extension */ +extern int expand_stack(struct vm_area_struct * vma, unsigned long address); + +/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); +extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, + struct vm_area_struct **pprev); +extern int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long addr, int new_below); + +/* Look up the first VMA which intersects the interval start_addr..end_addr-1, + NULL if none. Assume start_addr < end_addr. */ +static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr) +{ + struct vm_area_struct * vma = find_vma(mm,start_addr); + + if (vma && end_addr <= vma->vm_start) + vma = NULL; + return vma; +} + +extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); + +extern unsigned int nr_used_zone_pages(void); + +extern struct page * vmalloc_to_page(void *addr); +extern struct page * follow_page(struct mm_struct *mm, unsigned long address, + int write); +extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, + unsigned long to, unsigned long size, pgprot_t prot); + +#ifndef CONFIG_DEBUG_PAGEALLOC +static inline void +kernel_map_pages(struct page *page, int numpages, int enable) +{ +} +#endif + +#endif /* __KERNEL__ */ +#endif /* _LINUX_MM_H */ diff -Nru a/include/linux/mm.h~page-owner.diff b/include/linux/mm.h~page-owner.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/linux/mm.h~page-owner.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,633 @@ +#ifndef _LINUX_MM_H +#define _LINUX_MM_H + +#include <linux/sched.h> +#include <linux/errno.h> + +#ifdef __KERNEL__ + +#include <linux/config.h> +#include <linux/gfp.h> +#include <linux/list.h> +#include <linux/mmzone.h> +#include <linux/rbtree.h> +#include <linux/fs.h> + +#ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ +extern unsigned long max_mapnr; +#endif + +extern unsigned long num_physpages; +extern void * high_memory; +extern int page_cluster; + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/processor.h> +#include <asm/atomic.h> + +#ifndef MM_VM_SIZE +#define MM_VM_SIZE(mm) TASK_SIZE +#endif + +/* + * Linux kernel virtual memory manager primitives. + * The idea being to have a "virtual" mm in the same way + * we have a virtual fs - giving a cleaner interface to the + * mm details, and allowing different kinds of memory mappings + * (from shared memory to executable loading to arbitrary + * mmap() functions). + */ + +/* + * This struct defines a memory VMM memory area. There is one of these + * per VM-area/task. A VM area is any part of the process virtual memory + * space that has a special rule for the page-fault handlers (ie a shared + * library, the executable area etc). + * + * This structure is exactly 64 bytes on ia32. Please think very, very hard + * before adding anything to it. + */ +struct vm_area_struct { + struct mm_struct * vm_mm; /* The address space we belong to. 
*/ + unsigned long vm_start; /* Our start address within vm_mm. */ + unsigned long vm_end; /* The first byte after our end address + within vm_mm. */ + + /* linked list of VM areas per task, sorted by address */ + struct vm_area_struct *vm_next; + + pgprot_t vm_page_prot; /* Access permissions of this VMA. */ + unsigned long vm_flags; /* Flags, listed below. */ + + struct rb_node vm_rb; + + /* + * For areas with an address space and backing store, + * one of the address_space->i_mmap{,shared} lists, + * for shm areas, the list of attaches, otherwise unused. + */ + struct list_head shared; + + /* Function pointers to deal with this struct. */ + struct vm_operations_struct * vm_ops; + + /* Information about our backing store: */ + unsigned long vm_pgoff; /* Offset (within vm_file) in PAGE_SIZE + units, *not* PAGE_CACHE_SIZE */ + struct file * vm_file; /* File we map to (can be NULL). */ + void * vm_private_data; /* was vm_pte (shared mem) */ +}; + +/* + * vm_flags.. + */ +#define VM_READ 0x00000001 /* currently active flags */ +#define VM_WRITE 0x00000002 +#define VM_EXEC 0x00000004 +#define VM_SHARED 0x00000008 + +#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ +#define VM_MAYWRITE 0x00000020 +#define VM_MAYEXEC 0x00000040 +#define VM_MAYSHARE 0x00000080 + +#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define VM_GROWSUP 0x00000200 +#define VM_SHM 0x00000400 /* shared memory area, don't swap out */ +#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ + +#define VM_EXECUTABLE 0x00001000 +#define VM_LOCKED 0x00002000 +#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ + + /* Used by sys_madvise() */ +#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ +#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ + +#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ +#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ +#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ +#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ +#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS +#endif + +#ifdef CONFIG_STACK_GROWSUP +#define VM_STACK_FLAGS (VM_GROWSUP | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#else +#define VM_STACK_FLAGS (VM_GROWSDOWN | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) +#endif + +#define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) +#define VM_ClearReadHint(v) (v)->vm_flags &= ~VM_READHINTMASK +#define VM_NormalReadHint(v) (!((v)->vm_flags & VM_READHINTMASK)) +#define VM_SequentialReadHint(v) ((v)->vm_flags & VM_SEQ_READ) +#define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) + +/* + * mapping from the currently active vm_flags protection bits (the + * low four bits) to a page protection mask.. + */ +extern pgprot_t protection_map[16]; + + +/* + * These are the virtual MM functions - opening of an area, closing and + * unmapping it (needed to keep files on disk up-to-date etc), pointer + * to the functions called when a no-page or a wp-page exception occurs. 
+ */ +struct vm_operations_struct { + void (*open)(struct vm_area_struct * area); + void (*close)(struct vm_area_struct * area); + struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused); + int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); +}; + +/* forward declaration; pte_chain is meant to be internal to rmap.c */ +struct pte_chain; +struct mmu_gather; +struct inode; + +/* + * Each physical page in the system has a struct page associated with + * it to keep track of whatever it is we are using the page for at the + * moment. Note that we have no way to track which tasks are using + * a page. + * + * Try to keep the most commonly accessed fields in single cache lines + * here (16 bytes or greater). This ordering should be particularly + * beneficial on 32-bit processors. + * + * The first line is data used in page cache lookup, the second line + * is used for linear searches (eg. clock algorithm scans). + * + * TODO: make this structure smaller, it could be as small as 32 bytes. + */ +struct page { + unsigned long flags; /* atomic flags, some possibly + updated asynchronously */ + atomic_t count; /* Usage count, see below. */ + struct list_head list; /* ->mapping has some page lists. */ + struct address_space *mapping; /* The inode (or ...) we belong to. */ + unsigned long index; /* Our offset within mapping. */ + struct list_head lru; /* Pageout list, eg. active_list; + protected by zone->lru_lock !! */ + union { + struct pte_chain *chain;/* Reverse pte mapping pointer. + * protected by PG_chainlock */ + pte_addr_t direct; + } pte; + unsigned long private; /* mapping-private opaque data */ + + /* + * On machines where all RAM is mapped into kernel address space, + * we can simply calculate the virtual address. On machines with + * highmem some memory is mapped into kernel virtual memory + * dynamically, so we need a place to store that address. + * Note that this field could be 16 bits on x86 ... ;) + * + * Architectures with slow multiplication can define + * WANT_PAGE_VIRTUAL in asm/page.h + */ +#if defined(WANT_PAGE_VIRTUAL) + void *virtual; /* Kernel virtual address (NULL if + not kmapped, ie. highmem) */ +#endif /* WANT_PAGE_VIRTUAL */ +}; + +/* + * FIXME: take this include out, include page-flags.h in + * files which need it (119 of them) + */ +#include <linux/page-flags.h> + +/* + * Methods to modify the page usage count. + * + * What counts for a page usage: + * - cache mapping (page->mapping) + * - private data (page->private) + * - page mapped in a task's page tables, each mapping + * is counted separately + * + * Also, many kernel routines increase the page count before a critical + * routine so they can be sure the page doesn't go away from under them. + */ +#define put_page_testzero(p) \ + ({ \ + BUG_ON(page_count(p) == 0); \ + atomic_dec_and_test(&(p)->count); \ + }) + +#define page_count(p) atomic_read(&(p)->count) +#define set_page_count(p,v) atomic_set(&(p)->count, v) +#define __put_page(p) atomic_dec(&(p)->count) + +extern void FASTCALL(__page_cache_release(struct page *)); + +#ifdef CONFIG_HUGETLB_PAGE + +static inline void get_page(struct page *page) +{ + if (PageCompound(page)) + page = (struct page *)page->lru.next; + atomic_inc(&page->count); +} + +static inline void put_page(struct page *page) +{ + if (PageCompound(page)) { + page = (struct page *)page->lru.next; + if (put_page_testzero(page)) { + if (page->lru.prev) { /* destructor? 
*/ + (*(void (*)(struct page *))page->lru.prev)(page); + } else { + __page_cache_release(page); + } + } + return; + } + if (!PageReserved(page) && put_page_testzero(page)) + __page_cache_release(page); +} + +#else /* CONFIG_HUGETLB_PAGE */ + +static inline void get_page(struct page *page) +{ + atomic_inc(&page->count); +} + +static inline void put_page(struct page *page) +{ + if (!PageReserved(page) && put_page_testzero(page)) + __page_cache_release(page); +} + +#endif /* CONFIG_HUGETLB_PAGE */ + +/* + * Multiple processes may "see" the same page. E.g. for untouched + * mappings of /dev/null, all processes see the same page full of + * zeroes, and text pages of executables and shared libraries have + * only one copy in memory, at most, normally. + * + * For the non-reserved pages, page->count denotes a reference count. + * page->count == 0 means the page is free. + * page->count == 1 means the page is used for exactly one purpose + * (e.g. a private data page of one process). + * + * A page may be used for kmalloc() or anyone else who does a + * __get_free_page(). In this case the page->count is at least 1, and + * all other fields are unused but should be 0 or NULL. The + * management of this page is the responsibility of the one who uses + * it. + * + * The other pages (we may call them "process pages") are completely + * managed by the Linux memory manager: I/O, buffers, swapping etc. + * The following discussion applies only to them. + * + * A page may belong to an inode's memory mapping. In this case, + * page->mapping is the pointer to the inode, and page->index is the + * file offset of the page, in units of PAGE_CACHE_SIZE. + * + * A page contains an opaque `private' member, which belongs to the + * page's address_space. Usually, this is the address of a circular + * list of the page's disk buffers. + * + * For pages belonging to inodes, the page->count is the number of + * attaches, plus 1 if `private' contains something, plus one for + * the page cache itself. + * + * All pages belonging to an inode are in these doubly linked lists: + * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages; + * using the page->list list_head. These fields are also used for + * freelist managemet (when page->count==0). + * + * There is also a per-mapping radix tree mapping index to the page + * in memory if present. The tree is rooted at mapping->root. + * + * All process pages can do I/O: + * - inode pages may need to be read from disk, + * - inode pages which have been modified and are MAP_SHARED may need + * to be written to disk, + * - private pages which have been modified may need to be swapped out + * to swap space and (later) to be read back into memory. + */ + +/* + * The zone field is never updated after free_area_init_core() + * sets it, so none of the operations on it need to be atomic. 
+ */ +#define ZONE_SHIFT (BITS_PER_LONG - 8) + +struct zone; +extern struct zone *zone_table[]; + +static inline struct zone *page_zone(struct page *page) +{ + return zone_table[page->flags >> ZONE_SHIFT]; +} + +static inline void set_page_zone(struct page *page, unsigned long zone_num) +{ + page->flags &= ~(~0UL << ZONE_SHIFT); + page->flags |= zone_num << ZONE_SHIFT; +} + +#ifndef CONFIG_DISCONTIGMEM +/* The array of struct pages - for discontigmem use pgdat->lmem_map */ +extern struct page *mem_map; +#endif + +static inline void *lowmem_page_address(struct page *page) +{ + return __va(page_to_pfn(page) << PAGE_SHIFT); +} + +#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) +#define HASHED_PAGE_VIRTUAL +#endif + +#if defined(WANT_PAGE_VIRTUAL) +#define page_address(page) ((page)->virtual) +#define set_page_address(page, address) \ + do { \ + (page)->virtual = (address); \ + } while(0) +#define page_address_init() do { } while(0) +#endif + +#if defined(HASHED_PAGE_VIRTUAL) +void *page_address(struct page *page); +void set_page_address(struct page *page, void *virtual); +void page_address_init(void); +#endif + +#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) +#define page_address(page) lowmem_page_address(page) +#define set_page_address(page, address) do { } while(0) +#define page_address_init() do { } while(0) +#endif + +/* + * Return true if this page is mapped into pagetables. Subtle: test pte.direct + * rather than pte.chain. Because sometimes pte.direct is 64-bit, and .chain + * is only 32-bit. + */ +static inline int page_mapped(struct page *page) +{ + return page->pte.direct != 0; +} + +/* + * Error return values for the *_nopage functions + */ +#define NOPAGE_SIGBUS (NULL) +#define NOPAGE_OOM ((struct page *) (-1)) + +/* + * Different kinds of faults, as returned by handle_mm_fault(). + * Used to decide whether a process gets delivered SIGBUS or + * just gets major/minor fault counters bumped up. 
+ */
+#define VM_FAULT_OOM	(-1)
+#define VM_FAULT_SIGBUS	0
+#define VM_FAULT_MINOR	1
+#define VM_FAULT_MAJOR	2
+
+#define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
+
+extern void show_free_areas(void);
+
+struct page *shmem_nopage(struct vm_area_struct * vma,
+			unsigned long address, int unused);
+struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags);
+void shmem_lock(struct file * file, int lock);
+int shmem_zero_setup(struct vm_area_struct *);
+
+void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+			unsigned long size);
+int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+		struct vm_area_struct *start_vma, unsigned long start_addr,
+		unsigned long end_addr, unsigned long *nr_accounted);
+void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+			unsigned long address, unsigned long size);
+void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr);
+int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
+			struct vm_area_struct *vma);
+int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
+			unsigned long size, pgprot_t prot);
+
+extern void invalidate_mmap_range(struct address_space *mapping,
+				  loff_t const holebegin,
+				  loff_t const holelen);
+extern int vmtruncate(struct inode * inode, loff_t offset);
+extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
+extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
+extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
+extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
+extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access);
+extern int make_pages_present(unsigned long addr, unsigned long end);
+extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
+extern long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock);
+extern long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice);
+void put_dirty_page(struct task_struct *tsk, struct page *page,
+			unsigned long address, pgprot_t prot);
+
+int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
+		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+
+int __set_page_dirty_buffers(struct page *page);
+int __set_page_dirty_nobuffers(struct page *page);
+int set_page_dirty_lock(struct page *page);
+
+/*
+ * Prototype to add a shrinker callback for ageable caches.
+ *
+ * These functions are passed a count `nr_to_scan' and a `gfp_mask'.  They
+ * should scan `nr_to_scan' objects, attempting to free them.
+ *
+ * The callback must return the number of objects which remain in the cache.
+ *
+ * The callback will be passed nr_to_scan == 0 when the VM is querying the
+ * cache size, so a fastpath for that case is appropriate.
+ * (An illustrative example appears after the readahead defines below.)
+ */
+typedef int (*shrinker_t)(int nr_to_scan, unsigned int gfp_mask);
+
+/*
+ * Add an aging callback.  The int is the number of 'seeks' it takes
+ * to recreate one of the objects that these functions age.
+ */
+
+#define DEFAULT_SEEKS 2
+struct shrinker;
+extern struct shrinker *set_shrinker(int, shrinker_t);
+extern void remove_shrinker(struct shrinker *shrinker);
+
+/*
+ * If the mapping doesn't provide a set_page_dirty a_op, then
+ * just fall through and assume that it wants buffer_heads.
+ * FIXME: make the method unconditional.
+ */
+static inline int set_page_dirty(struct page *page)
+{
+	if (page->mapping) {
+		int (*spd)(struct page *);
+
+		spd = page->mapping->a_ops->set_page_dirty;
+		if (spd)
+			return (*spd)(page);
+	}
+	return __set_page_dirty_buffers(page);
+}
+
+extern long do_mprotect(struct mm_struct *mm, unsigned long start,
+			size_t len, unsigned long prot);
+
+/*
+ * On a two-level page table, this ends up being trivial. Thus the
+ * inlining and the symmetry break with pte_alloc_map() that does all
+ * of this out-of-line.
+ */
+static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+	if (pgd_none(*pgd))
+		return __pmd_alloc(mm, pgd, address);
+	return pmd_offset(pgd, address);
+}
+
+extern void free_area_init(unsigned long * zones_size);
+extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
+	unsigned long * zones_size, unsigned long zone_start_pfn,
+	unsigned long *zholes_size);
+extern void memmap_init_zone(struct page *, unsigned long, int,
+	unsigned long, unsigned long);
+extern void mem_init(void);
+extern void show_mem(void);
+extern void si_meminfo(struct sysinfo * val);
+extern void si_meminfo_node(struct sysinfo *val, int nid);
+
+/* mmap.c */
+extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
+extern void build_mmap_rb(struct mm_struct *);
+extern void exit_mmap(struct mm_struct *);
+
+extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+
+extern unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file *file,
+				   unsigned long addr, unsigned long len,
+				   unsigned long prot, unsigned long flag,
+				   unsigned long pgoff);
+
+static inline unsigned long do_mmap(struct file *file, unsigned long addr,
+	unsigned long len, unsigned long prot,
+	unsigned long flag, unsigned long offset)
+{
+	unsigned long ret = -EINVAL;
+	if ((offset + PAGE_ALIGN(len)) < offset)
+		goto out;
+	if (!(offset & ~PAGE_MASK))
+		ret = do_mmap_pgoff(current->mm, file, addr, len, prot, flag,
+				    offset >> PAGE_SHIFT);
+out:
+	return ret;
+}
+
+extern int do_munmap(struct mm_struct *, unsigned long, size_t);
+
+extern unsigned long do_brk(unsigned long, unsigned long);
+
+static inline void
+__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
+		struct vm_area_struct *prev)
+{
+	prev->vm_next = vma->vm_next;
+	rb_erase(&vma->vm_rb, &mm->mm_rb);
+	if (mm->mmap_cache == vma)
+		mm->mmap_cache = prev;
+}
+
+static inline int
+can_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags)
+{
+#ifdef CONFIG_MMU
+	if (!vma->vm_file && vma->vm_flags == vm_flags)
+		return 1;
+#endif
+	return 0;
+}
+
+/* filemap.c */
+extern unsigned long page_unuse(struct page *);
+extern void truncate_inode_pages(struct address_space *, loff_t);
+extern void truncate_mapping_pages_range(struct address_space *mapping,
+					 pgoff_t lstart, long count);
+
+/* generic vm_area_ops exported for stackable file systems */
+extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int);
+
+/* mm/page-writeback.c */
+int write_one_page(struct page *page, int wait);
+
+/* readahead.c */
+#define VM_MAX_READAHEAD	128	/* kbytes */
+#define VM_MIN_READAHEAD	16	/* kbytes (includes current page) */
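+/*
+ * Illustrative sketch only (not part of this interface): a minimal user of
+ * the shrinker hooks declared above.  The cache counter, its lock and the
+ * my_free_one() helper are assumed here and do not exist in the tree.
+ *
+ *	static int my_cache_count;
+ *	static spinlock_t my_cache_lock = SPIN_LOCK_UNLOCKED;
+ *	static struct shrinker *my_shrinker;
+ *
+ *	static int my_shrink(int nr_to_scan, unsigned int gfp_mask)
+ *	{
+ *		if (nr_to_scan == 0)
+ *			return my_cache_count;	(query: report size only)
+ *		spin_lock(&my_cache_lock);
+ *		while (nr_to_scan-- && my_cache_count)
+ *			my_free_one();		(frees one object, drops count)
+ *		spin_unlock(&my_cache_lock);
+ *		return my_cache_count;		(objects left in the cache)
+ *	}
+ *
+ * Registration pairs with removal:
+ *
+ *	my_shrinker = set_shrinker(DEFAULT_SEEKS, my_shrink);
+ *	...
+ *	remove_shrinker(my_shrinker);
+ */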
+
+int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read);
+int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
+			unsigned long offset, unsigned long nr_to_read);
+void page_cache_readahead(struct address_space *mapping,
+			  struct file_ra_state *ra,
+			  struct file *filp,
+			  unsigned long offset);
+void handle_ra_miss(struct address_space *mapping,
+		    struct file_ra_state *ra, pgoff_t offset);
+unsigned long max_sane_readahead(unsigned long nr);
+
+/* Do stack extension */
+extern int expand_stack(struct vm_area_struct * vma, unsigned long address);
+
+/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
+extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
+extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
+					     struct vm_area_struct **pprev);
+extern int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+		     unsigned long addr, int new_below);
+
+/* Look up the first VMA which intersects the interval start_addr..end_addr-1,
+   NULL if none.  Assume start_addr < end_addr. */
+static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)
+{
+	struct vm_area_struct * vma = find_vma(mm, start_addr);
+
+	if (vma && end_addr <= vma->vm_start)
+		vma = NULL;
+	return vma;
+}
+
+extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
+
+extern unsigned int nr_used_zone_pages(void);
+
+extern struct page * vmalloc_to_page(void *addr);
+extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
+		int write);
+extern int remap_page_range(struct vm_area_struct *vma, unsigned long from,
+		unsigned long to, unsigned long size, pgprot_t prot);
+
+#ifndef CONFIG_DEBUG_PAGEALLOC
+static inline void
+kernel_map_pages(struct page *page, int numpages, int enable)
+{
+}
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* _LINUX_MM_H */
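A note on the do_mmap() wrapper in the mm.h hunk above: it refuses file
offsets that are not page aligned and offsets whose page-aligned length would
wrap around the address space, and only then converts the byte offset into a
page offset for do_mmap_pgoff().  A standalone sketch of the same arithmetic
(userspace C; PAGE_SHIFT of 12 is assumed for illustration, none of this is
part of the patch):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long offset = 0x3000;	/* page aligned */
	unsigned long len = 0x2345;	/* PAGE_ALIGN() rounds up to 0x3000 */

	/* do_mmap() fails if offset + PAGE_ALIGN(len) wraps past zero... */
	printf("wraps:     %d\n", offset + PAGE_ALIGN(len) < offset);
	/* ...or if the offset has bits below PAGE_SIZE set. */
	printf("unaligned: %d\n", (offset & ~PAGE_MASK) != 0);
	/* What do_mmap_pgoff() finally sees is a page number, not bytes. */
	printf("pgoff:     %lu\n", offset >> PAGE_SHIFT);
	return 0;
}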
diff -Nru a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h	Fri Oct 31 14:10:54 2003
+++ b/include/linux/netdevice.h	Fri Oct 31 14:10:54 2003
@@ -498,6 +498,8 @@
 extern int		netdev_boot_setup_add(char *name, struct ifmap *map);
 extern int		netdev_boot_setup_check(struct net_device *dev);
 extern struct net_device	*dev_getbyhwaddr(unsigned short type, char *hwaddr);
+extern struct net_device *__dev_getfirstbyhwtype(unsigned short type);
+extern struct net_device *dev_getfirstbyhwtype(unsigned short type);
 extern void		dev_add_pack(struct packet_type *pt);
 extern void		dev_remove_pack(struct packet_type *pt);
 extern void		__dev_remove_pack(struct packet_type *pt);
diff -Nru a/include/linux/page-flags.h b/include/linux/page-flags.h
--- a/include/linux/page-flags.h	Fri Oct 31 14:10:54 2003
+++ b/include/linux/page-flags.h	Fri Oct 31 14:10:54 2003
@@ -141,14 +141,48 @@
  */
 #define PageLocked(page)		\
 		test_bit(PG_locked, &(page)->flags)
-#define SetPageLocked(page)		\
-		set_bit(PG_locked, &(page)->flags)
+
+#ifdef CONFIG_REISER4_DEBUG
+#define TestSetPageLocked(page)					\
+({								\
+	int ret;						\
+								\
+	ret = test_and_set_bit(PG_locked, &(page)->flags);	\
+	if (!ret)						\
+		page->owner = current;				\
+	ret;							\
+})
+
+#define ClearPageLocked(page)					\
+({								\
+	clear_bit(PG_locked, &(page)->flags);			\
+	page->owner = NULL;					\
+})
+
+#define TestClearPageLocked(page)				\
+({								\
+	page->owner = NULL;					\
+	test_and_clear_bit(PG_locked, &(page)->flags);		\
+})
+
+#define SetPageLocked(page)					\
+({								\
+								\
+	set_bit(PG_locked, &(page)->flags);			\
+	page->owner = current;					\
+})
+
+#else
+
 #define TestSetPageLocked(page)		\
 		test_and_set_bit(PG_locked, &(page)->flags)
 #define ClearPageLocked(page)		\
 		clear_bit(PG_locked, &(page)->flags)
 #define TestClearPageLocked(page)	\
 		test_and_clear_bit(PG_locked, &(page)->flags)
+#define SetPageLocked(page)	\
+		set_bit(PG_locked, &(page)->flags)
+#endif
 
 #define PageError(page)		test_bit(PG_error, &(page)->flags)
 #define SetPageError(page)	set_bit(PG_error, &(page)->flags)
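With CONFIG_REISER4_DEBUG set, the page-flags.h hunk above makes the
page-lock macros record the locking task in page->owner.  That field can
back ownership assertions along these lines; assert_page_locked_by_me() is
a hypothetical helper for illustration, not something this patch adds:

/* Sketch of a debug check built on page->owner; the field exists only
 * under CONFIG_REISER4_DEBUG, as in the hunk above. */
static inline void assert_page_locked_by_me(struct page *page)
{
#ifdef CONFIG_REISER4_DEBUG
	BUG_ON(!PageLocked(page));	/* must be locked at all... */
	BUG_ON(page->owner != current);	/* ...and locked by this task */
#endif
}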
diff -Nru a/include/linux/preempt.h b/include/linux/preempt.h
--- a/include/linux/preempt.h	Fri Oct 31 14:10:53 2003
+++ b/include/linux/preempt.h	Fri Oct 31 14:10:53 2003
@@ -32,8 +32,8 @@
 
 #define preempt_enable_no_resched() \
 do { \
-	dec_preempt_count(); \
 	barrier(); \
+	dec_preempt_count(); \
 } while (0)
 
 #define preempt_check_resched() \
diff -Nru a/include/linux/quotaops.h b/include/linux/quotaops.h
--- a/include/linux/quotaops.h	Fri Oct 31 14:10:53 2003
+++ b/include/linux/quotaops.h	Fri Oct 31 14:10:53 2003
@@ -174,38 +174,38 @@
 #define DQUOT_SYNC(sb)			do { } while(0)
 #define DQUOT_OFF(sb)			do { } while(0)
 #define DQUOT_TRANSFER(inode, iattr)	(0)
-extern __inline__ int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
+static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
 {
 	inode_add_bytes(inode, nr);
 	return 0;
 }
 
-extern __inline__ int DQUOT_PREALLOC_SPACE(struct inode *inode, qsize_t nr)
+static inline int DQUOT_PREALLOC_SPACE(struct inode *inode, qsize_t nr)
 {
 	DQUOT_PREALLOC_SPACE_NODIRTY(inode, nr);
 	mark_inode_dirty(inode);
 	return 0;
 }
 
-extern __inline__ int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
+static inline int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
 {
 	inode_add_bytes(inode, nr);
 	return 0;
 }
 
-extern __inline__ int DQUOT_ALLOC_SPACE(struct inode *inode, qsize_t nr)
+static inline int DQUOT_ALLOC_SPACE(struct inode *inode, qsize_t nr)
 {
 	DQUOT_ALLOC_SPACE_NODIRTY(inode, nr);
 	mark_inode_dirty(inode);
 	return 0;
 }
 
-extern __inline__ void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
+static inline void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
 {
 	inode_sub_bytes(inode, nr);
 }
 
-extern __inline__ void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr)
+static inline void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr)
 {
 	DQUOT_FREE_SPACE_NODIRTY(inode, nr);
 	mark_inode_dirty(inode);
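The quotaops.h conversion above, from the GNU-specific extern __inline__ to
static inline, is behaviour-neutral; static inline is simply the portable
spelling.  The preempt.h hunk further up, by contrast, is a correctness fix:
barrier() must come before dec_preempt_count(), otherwise the compiler may
sink a preemption-protected access past the decrement, where a preempting
task can race with it.  Roughly (hypothetical caller, for illustration only):

static int protected;	/* only touched with preemption disabled */

static void example(void)
{
	preempt_disable();
	protected++;
	/*
	 * With barrier() first, the store to 'protected' cannot be
	 * reordered below the dec_preempt_count() that makes this
	 * section preemptible again.
	 */
	preempt_enable_no_resched();
}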
diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h	Fri Oct 31 14:10:53 2003
+++ b/include/linux/sched.h	Fri Oct 31 14:10:53 2003
@@ -308,6 +308,24 @@
 struct backing_dev_info;
 struct reclaim_state;
 
+/*
+ * Some file systems need context associated with the current thread during
+ * one system call (a transaction handle, for example).  This context is
+ * attached to current->fs_context.
+ *
+ * As it is possible for file system calls to nest (through quota or VM
+ * callbacks), every file system using current->fs_context should store
+ * the original ->fs_context value on entrance and restore it on exit.
+ */
+struct fs_activation {
+	/*
+	 * cookie used to distinguish file system instances
+	 * (mounts).  Usually this is a pointer to the super block, but not
+	 * necessarily.  This is used to detect reentrance.
+	 */
+	void *owner;
+};
+
 /* POSIX.1b interval timer structure.
*/ struct k_itimer { struct list_head list; /* free/ allocate list */ @@ -451,8 +469,8 @@ /* context-switch lock */ spinlock_t switch_lock; -/* journalling filesystem info */ - void *journal_info; +/* info about current file system activation */ + struct fs_activation *fs_context; /* VM state */ struct reclaim_state *reclaim_state; @@ -483,6 +501,7 @@ /* Not implemented yet, only for 486*/ #define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ +#define PF_DEAD 0x00000008 /* Dead */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* dumped core */ diff -Nru a/include/linux/sched.h~fs_activation.diff b/include/linux/sched.h~fs_activation.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/linux/sched.h~fs_activation.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,906 @@ +#ifndef _LINUX_SCHED_H +#define _LINUX_SCHED_H + +#include <asm/param.h> /* for HZ */ + +#include <linux/config.h> +#include <linux/capability.h> +#include <linux/threads.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/timex.h> +#include <linux/jiffies.h> +#include <linux/rbtree.h> +#include <linux/thread_info.h> +#include <linux/cpumask.h> + +#include <asm/system.h> +#include <asm/semaphore.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/mmu.h> + +#include <linux/smp.h> +#include <linux/sem.h> +#include <linux/signal.h> +#include <linux/securebits.h> +#include <linux/fs_struct.h> +#include <linux/compiler.h> +#include <linux/completion.h> +#include <linux/pid.h> +#include <linux/percpu.h> + +struct exec_domain; + +/* + * cloning flags: + */ +#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ +#define CLONE_VM 0x00000100 /* set if VM shared between processes */ +#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ +#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ +#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ +#define CLONE_IDLETASK 0x00001000 /* set if new pid should be 0 (kernel only)*/ +#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ +#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ +#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ +#define CLONE_THREAD 0x00010000 /* Same thread group? */ +#define CLONE_NEWNS 0x00020000 /* New namespace group? */ +#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ +#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ +#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ +#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ +#define CLONE_DETACHED 0x00400000 /* Not used - CLONE_THREAD implies detached uniquely */ +#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ +#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ +#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ + +/* + * List of flags we want to share for kernel threads, + * if only because they are not used by them anyway. + */ +#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) + +/* + * These are the constant used to fake the fixed-point load-average + * counting. 
Some notes: + * - 11 bit fractions expand to 22 bits by the multiplies: this gives + * a load-average precision of 10 bits integer + 11 bits fractional + * - if you want to count load-averages more often, you need more + * precision, or rounding will get you. With 2-second counting freq, + * the EXP_n values would be 1981, 2034 and 2043 if still using only + * 11 bit fractions. + */ +extern unsigned long avenrun[]; /* Load averages */ + +#define FSHIFT 11 /* nr of bits of precision */ +#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ +#define LOAD_FREQ (5*HZ) /* 5 sec intervals */ +#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ +#define EXP_5 2014 /* 1/exp(5sec/5min) */ +#define EXP_15 2037 /* 1/exp(5sec/15min) */ + +#define CALC_LOAD(load,exp,n) \ + load *= exp; \ + load += n*(FIXED_1-exp); \ + load >>= FSHIFT; + +#define CT_TO_SECS(x) ((x) / HZ) +#define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) + +extern int nr_threads; +extern int last_pid; +DECLARE_PER_CPU(unsigned long, process_counts); +extern int nr_processes(void); +extern unsigned long nr_running(void); +extern unsigned long nr_uninterruptible(void); +extern unsigned long nr_iowait(void); + +#include <linux/time.h> +#include <linux/param.h> +#include <linux/resource.h> +#include <linux/timer.h> + +#include <asm/processor.h> + +#define TASK_RUNNING 0 +#define TASK_INTERRUPTIBLE 1 +#define TASK_UNINTERRUPTIBLE 2 +#define TASK_STOPPED 4 +#define TASK_ZOMBIE 8 +#define TASK_DEAD 16 + +#define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) +#define set_task_state(tsk, state_value) \ + set_mb((tsk)->state, (state_value)) + +#define __set_current_state(state_value) \ + do { current->state = (state_value); } while (0) +#define set_current_state(state_value) \ + set_mb(current->state, (state_value)) + +/* + * Scheduling policies + */ +#define SCHED_NORMAL 0 +#define SCHED_FIFO 1 +#define SCHED_RR 2 + +struct sched_param { + int sched_priority; +}; + +#ifdef __KERNEL__ + +#include <linux/spinlock.h> + +/* + * This serializes "schedule()" and also protects + * the run-queue from deletions/modifications (but + * _adding_ to the beginning of the run-queue has + * a separate lock). + */ +extern rwlock_t tasklist_lock; +extern spinlock_t mmlist_lock; + +typedef struct task_struct task_t; + +extern void sched_init(void); +extern void init_idle(task_t *idle, int cpu); + +extern void show_state(void); +extern void show_regs(struct pt_regs *); + +/* + * TASK is a pointer to the task whose backtrace we want to see (or NULL for current + * task), SP is the stack pointer of the first frame that should be shown in the back + * trace (or NULL if the entire call-chain of the task should be shown). + */ +extern void show_stack(struct task_struct *task, unsigned long *sp); + +void io_schedule(void); +long io_schedule_timeout(long timeout); + +extern void cpu_init (void); +extern void trap_init(void); +extern void update_process_times(int user); +extern void update_one_process(struct task_struct *p, unsigned long user, + unsigned long system, int cpu); +extern void scheduler_tick(int user_tick, int system); +extern unsigned long cache_decay_ticks; + + +#define MAX_SCHEDULE_TIMEOUT LONG_MAX +extern signed long FASTCALL(schedule_timeout(signed long timeout)); +asmlinkage void schedule(void); + +struct namespace; + +/* Maximum number of active map areas.. 
This is a random (large) number */
+#define MAX_MAP_COUNT (65536)
+
+#include <linux/aio.h>
+
+struct mm_struct {
+ struct vm_area_struct * mmap; /* list of VMAs */
+ struct rb_root mm_rb;
+ struct vm_area_struct * mmap_cache; /* last find_vma result */
+ unsigned long free_area_cache; /* first hole */
+ pgd_t * pgd;
+ atomic_t mm_users; /* How many users with user space? */
+ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
+ int map_count; /* number of VMAs */
+ struct rw_semaphore mmap_sem;
+ spinlock_t page_table_lock; /* Protects task page tables and mm->rss */
+
+ struct list_head mmlist; /* List of all active mm's. These are globally strung
+ * together off init_mm.mmlist, and are protected
+ * by mmlist_lock
+ */
+
+ unsigned long start_code, end_code, start_data, end_data;
+ unsigned long start_brk, brk, start_stack;
+ unsigned long arg_start, arg_end, env_start, env_end;
+ unsigned long rss, total_vm, locked_vm;
+ unsigned long def_flags;
+ cpumask_t cpu_vm_mask;
+ unsigned long swap_address;
+
+ unsigned long saved_auxv[40]; /* for /proc/PID/auxv */
+
+ unsigned dumpable:1;
+#ifdef CONFIG_HUGETLB_PAGE
+ int used_hugetlb;
+#endif
+ /* Architecture-specific MM context */
+ mm_context_t context;
+
+ /* coredumping support */
+ int core_waiters;
+ struct completion *core_startup_done, core_done;
+
+ /* aio bits */
+ rwlock_t ioctx_list_lock;
+ struct kioctx *ioctx_list;
+
+ struct kioctx default_kioctx;
+};
+
+extern int mmlist_nr;
+
+struct sighand_struct {
+ atomic_t count;
+ struct k_sigaction action[_NSIG];
+ spinlock_t siglock;
+};
+
+/*
+ * NOTE! "signal_struct" does not have its own
+ * locking, because a shared signal_struct always
+ * implies a shared sighand_struct, so locking
+ * sighand_struct is always a proper superset of
+ * the locking of signal_struct.
+ */
+struct signal_struct {
+ atomic_t count;
+
+ /* current thread group signal load-balancing target: */
+ task_t *curr_target;
+
+ /* shared signal handling: */
+ struct sigpending shared_pending;
+
+ /* thread group exit support */
+ int group_exit;
+ int group_exit_code;
+ /* overloaded:
+ * - notify group_exit_task when ->count is equal to notify_count
+ * - everyone except group_exit_task is stopped during signal delivery
+ * of fatal signals, group_exit_task processes the signal.
+ */
+ struct task_struct *group_exit_task;
+ int notify_count;
+
+ /* thread group stop support, overloads group_exit_code too */
+ int group_stop_count;
+};
+
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are
+ * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values
+ * are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space. This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO 100
+#define MAX_RT_PRIO MAX_USER_RT_PRIO
+
+#define MAX_PRIO (MAX_RT_PRIO + 40)
+
+#define rt_task(p) ((p)->prio < MAX_RT_PRIO)
+
+/*
+ * Some day this will be a full-fledged user tracking system..
+ */
+struct user_struct {
+ atomic_t __count; /* reference count */
+ atomic_t processes; /* How many processes does this user have? */
+ atomic_t files; /* How many open files does this user have?
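Worked out with the values just defined, the priority space looks like this (a worked example, not new definitions):

    MAX_RT_PRIO = MAX_USER_RT_PRIO = 100  ->  RT tasks occupy prio 0..99
    MAX_PRIO    = 100 + 40         = 140  ->  nice tasks occupy prio 100..139
    nice -20 maps to static_prio 100, nice 0 to 120, nice +19 to 139
    so rt_task(p) reduces to (p)->prio < 100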
*/ + + /* Hash table maintenance information */ + struct list_head uidhash_list; + uid_t uid; +}; + +extern struct user_struct *find_user(uid_t); + +extern struct user_struct root_user; +#define INIT_USER (&root_user) + +typedef struct prio_array prio_array_t; +struct backing_dev_info; +struct reclaim_state; + +/* POSIX.1b interval timer structure. */ +struct k_itimer { + struct list_head list; /* free/ allocate list */ + spinlock_t it_lock; + clockid_t it_clock; /* which timer type */ + timer_t it_id; /* timer id */ + int it_overrun; /* overrun on pending signal */ + int it_overrun_last; /* overrun on last delivered signal */ + int it_requeue_pending; /* waiting to requeue this timer */ + int it_sigev_notify; /* notify word of sigevent struct */ + int it_sigev_signo; /* signo word of sigevent struct */ + sigval_t it_sigev_value; /* value word of sigevent struct */ + unsigned long it_incr; /* interval specified in jiffies */ + struct task_struct *it_process; /* process to send signal to */ + struct timer_list it_timer; + struct sigqueue *sigq; /* signal queue entry. */ +}; + + +struct io_context; /* See blkdev.h */ +void exit_io_context(void); + +struct task_struct { + volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + struct thread_info *thread_info; + atomic_t usage; + unsigned long flags; /* per process flags, defined below */ + unsigned long ptrace; + + int lock_depth; /* Lock depth */ + + int prio, static_prio; + struct list_head run_list; + prio_array_t *array; + + unsigned long sleep_avg; + long interactive_credit; + unsigned long long timestamp; + int activated; + + unsigned long policy; + cpumask_t cpus_allowed; + unsigned int time_slice, first_time_slice; + + struct list_head tasks; + struct list_head ptrace_children; + struct list_head ptrace_list; + + struct mm_struct *mm, *active_mm; + +/* task state */ + struct linux_binfmt *binfmt; + int exit_code, exit_signal; + int pdeath_signal; /* The signal sent when the parent dies */ + /* ??? */ + unsigned long personality; + int did_exec:1; + pid_t pid; + pid_t __pgrp; /* Accessed via process_group() */ + pid_t tty_old_pgrp; + pid_t session; + pid_t tgid; + /* boolean value for session group leader */ + int leader; + /* + * pointers to (original) parent process, youngest child, younger sibling, + * older sibling, respectively. (p->father can be replaced with + * p->parent->pid) + */ + struct task_struct *real_parent; /* real parent process (when being debugged) */ + struct task_struct *parent; /* parent process */ + struct list_head children; /* list of my children */ + struct list_head sibling; /* linkage in my parent's children list */ + struct task_struct *group_leader; /* threadgroup leader */ + + /* PID/PID hash table linkage. 
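The ->children/->sibling pair above is ordinary list_head linkage: a task's ->sibling is its node in its parent's ->children list. A minimal sketch of walking a task's children (the printk and the variable p are illustrative only); tasklist_lock must be held:

    struct task_struct *child;

    read_lock(&tasklist_lock);
    list_for_each_entry(child, &p->children, sibling)
        printk("pid %d is a child of %d\n", child->pid, p->pid);
    read_unlock(&tasklist_lock);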
*/ + struct pid_link pids[PIDTYPE_MAX]; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + struct completion *vfork_done; /* for vfork() */ + int __user *set_child_tid; /* CLONE_CHILD_SETTID */ + int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ + + unsigned long rt_priority; + unsigned long it_real_value, it_prof_value, it_virt_value; + unsigned long it_real_incr, it_prof_incr, it_virt_incr; + struct timer_list real_timer; + struct list_head posix_timers; /* POSIX.1b Interval Timers */ + unsigned long utime, stime, cutime, cstime; + unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */ + u64 start_time; +/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ + unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; +/* process credentials */ + uid_t uid,euid,suid,fsuid; + gid_t gid,egid,sgid,fsgid; + int ngroups; + gid_t groups[NGROUPS]; + kernel_cap_t cap_effective, cap_inheritable, cap_permitted; + int keep_capabilities:1; + struct user_struct *user; +/* limits */ + struct rlimit rlim[RLIM_NLIMITS]; + unsigned short used_math; + char comm[16]; +/* file system info */ + int link_count, total_link_count; + struct tty_struct *tty; /* NULL if no tty */ +/* ipc stuff */ + struct sysv_sem sysvsem; +/* CPU-specific state of this task */ + struct thread_struct thread; +/* filesystem information */ + struct fs_struct *fs; +/* open file information */ + struct files_struct *files; +/* namespace */ + struct namespace *namespace; +/* signal handlers */ + struct signal_struct *signal; + struct sighand_struct *sighand; + + sigset_t blocked, real_blocked; + struct sigpending pending; + + unsigned long sas_ss_sp; + size_t sas_ss_size; + int (*notifier)(void *priv); + void *notifier_data; + sigset_t *notifier_mask; + + void *security; + +/* Thread group tracking */ + u32 parent_exec_id; + u32 self_exec_id; +/* Protection of (de-)allocation: mm, files, fs, tty */ + spinlock_t alloc_lock; +/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ + spinlock_t proc_lock; +/* context-switch lock */ + spinlock_t switch_lock; + +/* journalling filesystem info */ + void *journal_info; + +/* VM state */ + struct reclaim_state *reclaim_state; + + struct dentry *proc_dentry; + struct backing_dev_info *backing_dev_info; + + struct io_context *io_context; + + unsigned long ptrace_message; + siginfo_t *last_siginfo; /* For ptrace use. 
*/ +}; + +static inline pid_t process_group(struct task_struct *tsk) +{ + return tsk->group_leader->__pgrp; +} + +extern void __put_task_struct(struct task_struct *tsk); +#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) +#define put_task_struct(tsk) \ +do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) + +/* + * Per process flags + */ +#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ + /* Not implemented yet, only for 486*/ +#define PF_STARTING 0x00000002 /* being created */ +#define PF_EXITING 0x00000004 /* getting shut down */ +#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ +#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ +#define PF_DUMPCORE 0x00000200 /* dumped core */ +#define PF_SIGNALED 0x00000400 /* killed by a signal */ +#define PF_MEMALLOC 0x00000800 /* Allocating memory */ +#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */ +#define PF_FLUSHER 0x00002000 /* responsible for disk writeback */ + +#define PF_FREEZE 0x00004000 /* this task should be frozen for suspend */ +#define PF_IOTHREAD 0x00008000 /* this thread is needed for doing I/O to swap */ +#define PF_FROZEN 0x00010000 /* frozen for system suspend */ +#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ +#define PF_KSWAPD 0x00040000 /* I am kswapd */ +#define PF_SWAPOFF 0x00080000 /* I am in swapoff */ +#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ +#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ + +#ifdef CONFIG_SMP +extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); +#else +static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) +{ + return 0; +} +#endif + +extern unsigned long long sched_clock(void); + +#ifdef CONFIG_NUMA +extern void sched_balance_exec(void); +extern void node_nr_running_init(void); +#else +#define sched_balance_exec() {} +#define node_nr_running_init() {} +#endif + +extern void set_user_nice(task_t *p, long nice); +extern int task_prio(task_t *p); +extern int task_nice(task_t *p); +extern int task_curr(task_t *p); +extern int idle_cpu(int cpu); + +void yield(void); + +/* + * The default (Linux) execution domain. + */ +extern struct exec_domain default_exec_domain; + +#ifndef INIT_THREAD_SIZE +# define INIT_THREAD_SIZE 2048*sizeof(long) +#endif + +union thread_union { + struct thread_info thread_info; + unsigned long stack[INIT_THREAD_SIZE/sizeof(long)]; +}; + +#ifndef __HAVE_ARCH_KSTACK_END +static inline int kstack_end(void *addr) +{ + /* Reliable end of stack detection: + * Some APM bios versions misalign the stack + */ + return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); +} +#endif + +extern union thread_union init_thread_union; +extern struct task_struct init_task; + +extern struct mm_struct init_mm; + +extern struct task_struct *find_task_by_pid(int pid); +extern void set_special_pids(pid_t session, pid_t pgrp); +extern void __set_special_pids(pid_t session, pid_t pgrp); + +/* per-UID process charging. 
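find_task_by_pid() returns a task pinned only by tasklist_lock, so the usual idiom takes a ->usage reference before dropping the lock. A minimal sketch (the pid variable is assumed):

    struct task_struct *p;

    read_lock(&tasklist_lock);
    p = find_task_by_pid(pid);
    if (p)
        get_task_struct(p);    /* atomic_inc(&p->usage) */
    read_unlock(&tasklist_lock);

    if (p) {
        /* p cannot be freed here, even if it exits meanwhile */
        put_task_struct(p);    /* __put_task_struct() on the last reference */
    }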
*/ +extern struct user_struct * alloc_uid(uid_t); +extern void free_uid(struct user_struct *); +extern void switch_uid(struct user_struct *); + +#include <asm/current.h> + +extern unsigned long itimer_ticks; +extern unsigned long itimer_next; +extern void do_timer(struct pt_regs *); + +extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); +extern int FASTCALL(wake_up_process(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process_kick(struct task_struct * tsk)); +extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk)); +extern void FASTCALL(sched_exit(task_t * p)); + +asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru); + +extern int in_group_p(gid_t); +extern int in_egroup_p(gid_t); + +extern void proc_caches_init(void); +extern void flush_signals(struct task_struct *); +extern void flush_signal_handlers(struct task_struct *, int force_default); +extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); + +static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + ret = dequeue_signal(tsk, mask, info); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + + return ret; +} + +extern void block_all_signals(int (*notifier)(void *priv), void *priv, + sigset_t *mask); +extern void unblock_all_signals(void); +extern void release_task(struct task_struct * p); +extern int send_sig_info(int, struct siginfo *, struct task_struct *); +extern int send_group_sig_info(int, struct siginfo *, struct task_struct *); +extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp); +extern int kill_pg_info(int, struct siginfo *, pid_t); +extern int kill_sl_info(int, struct siginfo *, pid_t); +extern int kill_proc_info(int, struct siginfo *, pid_t); +extern void notify_parent(struct task_struct *, int); +extern void do_notify_parent(struct task_struct *, int); +extern void force_sig(int, struct task_struct *); +extern void force_sig_specific(int, struct task_struct *); +extern int send_sig(int, struct task_struct *, int); +extern void zap_other_threads(struct task_struct *p); +extern int kill_pg(pid_t, int, int); +extern int kill_sl(pid_t, int, int); +extern int kill_proc(pid_t, int, int); +extern struct sigqueue *sigqueue_alloc(void); +extern void sigqueue_free(struct sigqueue *); +extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); +extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); +extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); +extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); + +/* These can be the second arg to send_sig_info/send_group_sig_info. */ +#define SEND_SIG_NOINFO ((struct siginfo *) 0) +#define SEND_SIG_PRIV ((struct siginfo *) 1) +#define SEND_SIG_FORCED ((struct siginfo *) 2) + +/* True if we are on the alternate signal stack. */ + +static inline int on_sig_stack(unsigned long sp) +{ + return (sp - current->sas_ss_sp < current->sas_ss_size); +} + +static inline int sas_ss_flags(unsigned long sp) +{ + return (current->sas_ss_size == 0 ? SS_DISABLE + : on_sig_stack(sp) ? 
SS_ONSTACK : 0);
+}
+
+
+#ifdef CONFIG_SECURITY
+/* code is in security.c */
+extern int capable(int cap);
+#else
+static inline int capable(int cap)
+{
+ if (cap_raised(current->cap_effective, cap)) {
+ current->flags |= PF_SUPERPRIV;
+ return 1;
+ }
+ return 0;
+}
+#endif
+
+/*
+ * Routines for handling mm_structs
+ */
+extern struct mm_struct * mm_alloc(void);
+
+/* mmdrop drops the mm and the page tables */
+extern inline void FASTCALL(__mmdrop(struct mm_struct *));
+static inline void mmdrop(struct mm_struct * mm)
+{
+ if (atomic_dec_and_test(&mm->mm_count))
+ __mmdrop(mm);
+}
+
+/* mmput gets rid of the mappings and all user-space */
+extern void mmput(struct mm_struct *);
+/* Grab a reference to the mm if it's not already going away */
+extern struct mm_struct *mmgrab(struct mm_struct *);
+/* Remove the current task's stale references to the old mm_struct */
+extern void mm_release(struct task_struct *, struct mm_struct *);
+
+extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+extern void flush_thread(void);
+extern void exit_thread(void);
+
+extern void exit_mm(struct task_struct *);
+extern void exit_files(struct task_struct *);
+extern void exit_signal(struct task_struct *);
+extern void __exit_signal(struct task_struct *);
+extern void exit_sighand(struct task_struct *);
+extern void __exit_sighand(struct task_struct *);
+extern void exit_itimers(struct task_struct *);
+
+extern NORET_TYPE void do_group_exit(int);
+
+extern void reparent_to_init(void);
+extern void daemonize(const char *, ...);
+extern int allow_signal(int);
+extern int disallow_signal(int);
+extern task_t *child_reaper;
+
+extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
+extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
+extern struct task_struct * copy_process(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
+
+#ifdef CONFIG_SMP
+extern void wait_task_inactive(task_t * p);
+#else
+#define wait_task_inactive(p) do { } while (0)
+#endif
+
+#define remove_parent(p) list_del_init(&(p)->sibling)
+#define add_parent(p, parent) list_add_tail(&(p)->sibling,&(parent)->children)
+
+#define REMOVE_LINKS(p) do { \
+ if (thread_group_leader(p)) \
+ list_del_init(&(p)->tasks); \
+ remove_parent(p); \
+ } while (0)
+
+#define SET_LINKS(p) do { \
+ if (thread_group_leader(p)) \
+ list_add_tail(&(p)->tasks,&init_task.tasks); \
+ add_parent(p, (p)->parent); \
+ } while (0)
+
+#define next_task(p) list_entry((p)->tasks.next, struct task_struct, tasks)
+#define prev_task(p) list_entry((p)->tasks.prev, struct task_struct, tasks)
+
+#define for_each_process(p) \
+ for (p = &init_task ; (p = next_task(p)) != &init_task ; )
+
+/*
+ * Careful: do_each_thread/while_each_thread is a double loop so
+ * 'break' will not work as expected - use goto instead.
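On !CONFIG_SECURITY kernels capable() is the inline above: it tests cap_effective and, on success, marks the task PF_SUPERPRIV so accounting can record that privilege was used. The conventional guard, sketched (CAP_SYS_ADMIN is just the usual example capability):

    if (!capable(CAP_SYS_ADMIN))
        return -EPERM;
    /* reaching here, current->flags now carries PF_SUPERPRIV */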
+ */ +#define do_each_thread(g, t) \ + for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do + +#define while_each_thread(g, t) \ + while ((t = next_thread(t)) != g) + +extern task_t * FASTCALL(next_thread(task_t *p)); + +#define thread_group_leader(p) (p->pid == p->tgid) + +static inline int thread_group_empty(task_t *p) +{ + struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; + + return pid->task_list.next->next == &pid->task_list; +} + +#define delay_group_leader(p) \ + (thread_group_leader(p) && !thread_group_empty(p)) + +extern void unhash_process(struct task_struct *p); + +/* Protects ->fs, ->files, ->mm, and synchronises with wait4(). + * Nests both inside and outside of read_lock(&tasklist_lock). + * It must not be nested with write_lock_irq(&tasklist_lock), + * neither inside nor outside. + */ +static inline void task_lock(struct task_struct *p) +{ + spin_lock(&p->alloc_lock); +} + +static inline void task_unlock(struct task_struct *p) +{ + spin_unlock(&p->alloc_lock); +} + +/** + * get_task_mm - acquire a reference to the task's mm + * + * Returns %NULL if the task has no mm. User must release + * the mm via mmput() after use. + */ +static inline struct mm_struct * get_task_mm(struct task_struct * task) +{ + struct mm_struct * mm; + + task_lock(task); + mm = task->mm; + if (mm) + mm = mmgrab(mm); + task_unlock(task); + + return mm; +} + + +/* set thread flags in other task's structures + * - see asm/thread_info.h for TIF_xxxx flags available + */ +static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + set_ti_thread_flag(tsk->thread_info,flag); +} + +static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + clear_ti_thread_flag(tsk->thread_info,flag); +} + +static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_and_set_ti_thread_flag(tsk->thread_info,flag); +} + +static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_and_clear_ti_thread_flag(tsk->thread_info,flag); +} + +static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) +{ + return test_ti_thread_flag(tsk->thread_info,flag); +} + +static inline void set_tsk_need_resched(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); +} + +static inline void clear_tsk_need_resched(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); +} + +static inline int signal_pending(struct task_struct *p) +{ + return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); +} + +static inline int need_resched(void) +{ + return unlikely(test_thread_flag(TIF_NEED_RESCHED)); +} + +extern void __cond_resched(void); +static inline void cond_resched(void) +{ + if (need_resched()) + __cond_resched(); +} + +/* + * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * call schedule, and on return reacquire the lock. + * + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * operations here to prevent schedule() from being called twice (once via + * spin_unlock(), once by hand). + */ +static inline void cond_resched_lock(spinlock_t * lock) +{ + if (need_resched()) { + _raw_spin_unlock(lock); + preempt_enable_no_resched(); + __cond_resched(); + spin_lock(lock); + } +} + +/* Reevaluate whether the task has signals pending delivery. + This is required every time the blocked sigset_t changes. + callers must hold sighand->siglock. 
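Since do_each_thread()/while_each_thread() expand to two nested loops, a bare break only leaves the inner one; that is the reason for the goto advice above. A minimal sketch of a full scan, counting threads that own an mm, under tasklist_lock:

    task_t *g, *t;
    int threads_with_mm = 0;

    read_lock(&tasklist_lock);
    do_each_thread(g, t) {
        if (t->mm)
            threads_with_mm++;
    } while_each_thread(g, t);
    read_unlock(&tasklist_lock);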
*/ + +extern FASTCALL(void recalc_sigpending_tsk(struct task_struct *t)); +extern void recalc_sigpending(void); + +extern void signal_wake_up(struct task_struct *t, int resume_stopped); + +/* + * Wrappers for p->thread_info->cpu access. No-op on UP. + */ +#ifdef CONFIG_SMP + +static inline unsigned int task_cpu(struct task_struct *p) +{ + return p->thread_info->cpu; +} + +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + p->thread_info->cpu = cpu; +} + +#else + +static inline unsigned int task_cpu(struct task_struct *p) +{ + return 0; +} + +static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ +} + +#endif /* CONFIG_SMP */ + +#endif /* __KERNEL__ */ + +#endif diff -Nru a/include/linux/serial.h b/include/linux/serial.h --- a/include/linux/serial.h Fri Oct 31 14:10:53 2003 +++ b/include/linux/serial.h Fri Oct 31 14:10:53 2003 @@ -49,7 +49,6 @@ unsigned short iomem_reg_shift; unsigned int port_high; unsigned long iomap_base; /* cookie passed into ioremap */ - int reserved[1]; }; /* diff -Nru a/include/linux/socket.h b/include/linux/socket.h --- a/include/linux/socket.h Fri Oct 31 14:10:54 2003 +++ b/include/linux/socket.h Fri Oct 31 14:10:54 2003 @@ -1,6 +1,21 @@ #ifndef _LINUX_SOCKET_H #define _LINUX_SOCKET_H +/* + * Desired design of maximum size and alignment (see RFC2553) + */ +#define _K_SS_MAXSIZE 128 /* Implementation specific max size */ +#define _K_SS_ALIGNSIZE (__alignof__ (struct sockaddr *)) + /* Implementation specific desired alignment */ + +struct __kernel_sockaddr_storage { + unsigned short ss_family; /* address family */ + /* Following field(s) are implementation specific */ + char __data[_K_SS_MAXSIZE - sizeof(unsigned short)]; + /* space to achieve desired size, */ + /* _SS_MAXSIZE value minus size of ss_family */ +} __attribute__ ((aligned(_K_SS_ALIGNSIZE))); /* force desired alignment */ + #if defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2) #include <linux/config.h> /* for CONFIG_COMPAT */ @@ -27,20 +42,7 @@ int l_linger; /* How long to linger for */ }; -/* - * Desired design of maximum size and alignment (see RFC2553) - */ -#define _SS_MAXSIZE 128 /* Implementation specific max size */ -#define _SS_ALIGNSIZE (__alignof__ (struct sockaddr *)) - /* Implementation specific desired alignment */ - -struct sockaddr_storage { - sa_family_t ss_family; /* address family */ - /* Following field(s) are implementation specific */ - char __data[_SS_MAXSIZE - sizeof(sa_family_t)]; - /* space to achieve desired size, */ - /* _SS_MAXSIZE value minus size of ss_family */ -} __attribute__ ((aligned(_SS_ALIGNSIZE))); /* force desired alignment */ +#define sockaddr_storage __kernel_sockaddr_storage /* * As we do 4.4BSD message passing we use a 4.4BSD message passing diff -Nru a/include/linux/spinlock.h b/include/linux/spinlock.h --- a/include/linux/spinlock.h Fri Oct 31 14:10:53 2003 +++ b/include/linux/spinlock.h Fri Oct 31 14:10:53 2003 @@ -180,8 +180,8 @@ #define _raw_read_unlock(lock) do { (void)(lock); } while(0) #define _raw_write_lock(lock) do { (void)(lock); } while(0) #define _raw_write_unlock(lock) do { (void)(lock); } while(0) -#define _raw_write_trylock(lock) ({ (void)(lock); (1); }) - +#define _raw_write_trylock(lock) ({ (void)(lock); (1); }) +#define spin_lock_dont_check(lock) #endif /* !SMP */ /* diff -Nru a/include/linux/spinlock.h~spinlock-owner.diff b/include/linux/spinlock.h~spinlock-owner.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/linux/spinlock.h~spinlock-owner.diff Fri Oct 31 14:10:55 2003 
@@ -0,0 +1,468 @@ +#ifndef __LINUX_SPINLOCK_H +#define __LINUX_SPINLOCK_H + +/* + * include/linux/spinlock.h - generic locking declarations + */ + +#include <linux/config.h> +#include <linux/preempt.h> +#include <linux/linkage.h> +#include <linux/compiler.h> +#include <linux/thread_info.h> +#include <linux/kernel.h> +#include <linux/stringify.h> + +#include <asm/processor.h> /* for cpu relax */ +#include <asm/system.h> + +/* + * Must define these before including other files, inline functions need them + */ +#define LOCK_SECTION_NAME \ + ".text.lock." __stringify(KBUILD_BASENAME) + +#define LOCK_SECTION_START(extra) \ + ".subsection 1\n\t" \ + extra \ + ".ifndef " LOCK_SECTION_NAME "\n\t" \ + LOCK_SECTION_NAME ":\n\t" \ + ".endif\n\t" + +#define LOCK_SECTION_END \ + ".previous\n\t" + +/* + * If CONFIG_SMP is set, pull in the _raw_* definitions + */ +#ifdef CONFIG_SMP +#include <asm/spinlock.h> + +#else + +#if !defined(CONFIG_PREEMPT) && !defined(CONFIG_DEBUG_SPINLOCK) +# define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic) +# define ATOMIC_DEC_AND_LOCK +#endif + +#ifdef CONFIG_DEBUG_SPINLOCK + +#define SPINLOCK_MAGIC 0x1D244B3C +typedef struct { + unsigned long magic; + volatile unsigned long lock; + volatile unsigned int babble; + const char *module; + char *owner; + int oline; +} spinlock_t; +#define SPIN_LOCK_UNLOCKED (spinlock_t) { SPINLOCK_MAGIC, 0, 10, __FILE__ , NULL, 0} + +#define spin_lock_init(x) \ + do { \ + (x)->magic = SPINLOCK_MAGIC; \ + (x)->lock = 0; \ + (x)->babble = 5; \ + (x)->module = __FILE__; \ + (x)->owner = NULL; \ + (x)->oline = 0; \ + } while (0) + +#define CHECK_LOCK(x) \ + do { \ + if ((x)->magic != SPINLOCK_MAGIC) { \ + printk(KERN_ERR "%s:%d: spin_is_locked on uninitialized spinlock %p.\n", \ + __FILE__, __LINE__, (x)); \ + } \ + } while(0) + +#define _raw_spin_lock(x) \ + do { \ + CHECK_LOCK(x); \ + if ((x)->lock&&(x)->babble) { \ + (x)->babble--; \ + printk("%s:%d: spin_lock(%s:%p) already locked by %s/%d\n", \ + __FILE__,__LINE__, (x)->module, \ + (x), (x)->owner, (x)->oline); \ + } \ + (x)->lock = 1; \ + (x)->owner = __FILE__; \ + (x)->oline = __LINE__; \ + } while (0) + +/* without debugging, spin_is_locked on UP always says + * FALSE. --> printk if already locked. */ +#define spin_is_locked(x) \ + ({ \ + CHECK_LOCK(x); \ + if ((x)->lock&&(x)->babble) { \ + (x)->babble--; \ + printk("%s:%d: spin_is_locked(%s:%p) already locked by %s/%d\n", \ + __FILE__,__LINE__, (x)->module, \ + (x), (x)->owner, (x)->oline); \ + } \ + 0; \ + }) + +/* without debugging, spin_trylock on UP always says + * TRUE. --> printk if already locked. 
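The debug spinlock_t records the file and line of the current owner, so on a UP kernel, where a plain spinlock is otherwise a no-op, a recursive acquisition is reported instead of passing silently. A sketch of what this path catches (the babble counter merely rate-limits the printks):

    static spinlock_t lock = SPIN_LOCK_UNLOCKED;

    spin_lock(&lock);
    spin_lock(&lock);    /* debug UP build: printk "spin_lock(...) already
                            locked by <file>/<line>", then proceeds */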
*/ +#define _raw_spin_trylock(x) \ + ({ \ + CHECK_LOCK(x); \ + if ((x)->lock&&(x)->babble) { \ + (x)->babble--; \ + printk("%s:%d: spin_trylock(%s:%p) already locked by %s/%d\n", \ + __FILE__,__LINE__, (x)->module, \ + (x), (x)->owner, (x)->oline); \ + } \ + (x)->lock = 1; \ + (x)->owner = __FILE__; \ + (x)->oline = __LINE__; \ + 1; \ + }) + +#define spin_unlock_wait(x) \ + do { \ + CHECK_LOCK(x); \ + if ((x)->lock&&(x)->babble) { \ + (x)->babble--; \ + printk("%s:%d: spin_unlock_wait(%s:%p) owned by %s/%d\n", \ + __FILE__,__LINE__, (x)->module, (x), \ + (x)->owner, (x)->oline); \ + }\ + } while (0) + +#define _raw_spin_unlock(x) \ + do { \ + CHECK_LOCK(x); \ + if (!(x)->lock&&(x)->babble) { \ + (x)->babble--; \ + printk("%s:%d: spin_unlock(%s:%p) not locked\n", \ + __FILE__,__LINE__, (x)->module, (x));\ + } \ + (x)->lock = 0; \ + } while (0) +#else +/* + * gcc versions before ~2.95 have a nasty bug with empty initializers. + */ +#if (__GNUC__ > 2) + typedef struct { } spinlock_t; + #define SPIN_LOCK_UNLOCKED (spinlock_t) { } +#else + typedef struct { int gcc_is_buggy; } spinlock_t; + #define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 } +#endif + +/* + * If CONFIG_SMP is unset, declare the _raw_* definitions as nops + */ +#define spin_lock_init(lock) do { (void)(lock); } while(0) +#define _raw_spin_lock(lock) do { (void)(lock); } while(0) +#define spin_is_locked(lock) ((void)(lock), 0) +#define _raw_spin_trylock(lock) ((void)(lock), 1) +#define spin_unlock_wait(lock) do { (void)(lock); } while(0) +#define _raw_spin_unlock(lock) do { (void)(lock); } while(0) +#endif /* CONFIG_DEBUG_SPINLOCK */ + +/* RW spinlocks: No debug version */ + +#if (__GNUC__ > 2) + typedef struct { } rwlock_t; + #define RW_LOCK_UNLOCKED (rwlock_t) { } +#else + typedef struct { int gcc_is_buggy; } rwlock_t; + #define RW_LOCK_UNLOCKED (rwlock_t) { 0 } +#endif + +#define rwlock_init(lock) do { (void)(lock); } while(0) +#define _raw_read_lock(lock) do { (void)(lock); } while(0) +#define _raw_read_unlock(lock) do { (void)(lock); } while(0) +#define _raw_write_lock(lock) do { (void)(lock); } while(0) +#define _raw_write_unlock(lock) do { (void)(lock); } while(0) +#define _raw_write_trylock(lock) ({ (void)(lock); (1); }) + +#endif /* !SMP */ + +/* + * Define the various spin_lock and rw_lock methods. Note we define these + * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various + * methods are defined as nops in the case they are not required. + */ +#define spin_trylock(lock) ({preempt_disable(); _raw_spin_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) + +#define write_trylock(lock) ({preempt_disable();_raw_write_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) + +/* Where's read_trylock? 
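spin_trylock() disables preemption first and re-enables it only if the raw trylock fails, so the preempt count is balanced on both outcomes. The usual opportunistic-locking shape, sketched (do_work() is a placeholder):

    if (spin_trylock(&lock)) {
        /* fast path: lock owned, preemption disabled */
        do_work();
        spin_unlock(&lock);
    } else {
        /* contended: fall back, queue the work, or retry later */
    }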
*/ + +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) +void __preempt_spin_lock(spinlock_t *lock); +void __preempt_write_lock(rwlock_t *lock); + +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + if (unlikely(!_raw_spin_trylock(lock))) \ + __preempt_spin_lock(lock); \ +} while (0) + +#define write_lock(lock) \ +do { \ + preempt_disable(); \ + if (unlikely(!_raw_write_trylock(lock))) \ + __preempt_write_lock(lock); \ +} while (0) + +#else +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _raw_spin_lock(lock); \ +} while(0) + +#define write_lock(lock) \ +do { \ + preempt_disable(); \ + _raw_write_lock(lock); \ +} while(0) +#endif + +#define read_lock(lock) \ +do { \ + preempt_disable(); \ + _raw_read_lock(lock); \ +} while(0) + +#define spin_unlock(lock) \ +do { \ + _raw_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define write_unlock(lock) \ +do { \ + _raw_write_unlock(lock); \ + preempt_enable(); \ +} while(0) + +#define read_unlock(lock) \ +do { \ + _raw_read_unlock(lock); \ + preempt_enable(); \ +} while(0) + +#define spin_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _raw_spin_lock(lock); \ +} while (0) + +#define spin_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _raw_spin_lock(lock); \ +} while (0) + +#define spin_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _raw_spin_lock(lock); \ +} while (0) + +#define read_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _raw_read_lock(lock); \ +} while (0) + +#define read_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _raw_read_lock(lock); \ +} while (0) + +#define read_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _raw_read_lock(lock); \ +} while (0) + +#define write_lock_irqsave(lock, flags) \ +do { \ + local_irq_save(flags); \ + preempt_disable(); \ + _raw_write_lock(lock); \ +} while (0) + +#define write_lock_irq(lock) \ +do { \ + local_irq_disable(); \ + preempt_disable(); \ + _raw_write_lock(lock); \ +} while (0) + +#define write_lock_bh(lock) \ +do { \ + local_bh_disable(); \ + preempt_disable(); \ + _raw_write_lock(lock); \ +} while (0) + +#define spin_unlock_irqrestore(lock, flags) \ +do { \ + _raw_spin_unlock(lock); \ + local_irq_restore(flags); \ + preempt_enable(); \ +} while (0) + +#define _raw_spin_unlock_irqrestore(lock, flags) \ +do { \ + _raw_spin_unlock(lock); \ + local_irq_restore(flags); \ +} while (0) + +#define spin_unlock_irq(lock) \ +do { \ + _raw_spin_unlock(lock); \ + local_irq_enable(); \ + preempt_enable(); \ +} while (0) + +#define spin_unlock_bh(lock) \ +do { \ + _raw_spin_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#define read_unlock_irqrestore(lock, flags) \ +do { \ + _raw_read_unlock(lock); \ + local_irq_restore(flags); \ + preempt_enable(); \ +} while (0) + +#define read_unlock_irq(lock) \ +do { \ + _raw_read_unlock(lock); \ + local_irq_enable(); \ + preempt_enable(); \ +} while (0) + +#define read_unlock_bh(lock) \ +do { \ + _raw_read_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#define write_unlock_irqrestore(lock, flags) \ +do { \ + _raw_write_unlock(lock); \ + local_irq_restore(flags); \ + preempt_enable(); \ +} while (0) + +#define write_unlock_irq(lock) \ +do { \ + _raw_write_unlock(lock); \ + local_irq_enable(); \ + preempt_enable(); \ +} while (0) + +#define write_unlock_bh(lock) \ +do { \ + 
_raw_write_unlock(lock); \ + preempt_enable(); \ + local_bh_enable(); \ +} while (0) + +#define spin_trylock_bh(lock) ({ local_bh_disable(); preempt_disable(); \ + _raw_spin_trylock(lock) ? 1 : \ + ({preempt_enable(); local_bh_enable(); 0;});}) + +/* "lock on reference count zero" */ +#ifndef ATOMIC_DEC_AND_LOCK +#include <asm/atomic.h> +extern int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); +#endif + +/* + * bit-based spin_lock() + * + * Don't use this unless you really need to: spin_lock() and spin_unlock() + * are significantly faster. + */ +static inline void bit_spin_lock(int bitnum, unsigned long *addr) +{ + /* + * Assuming the lock is uncontended, this never enters + * the body of the outer loop. If it is contended, then + * within the inner loop a non-atomic test is used to + * busywait with less bus contention for a good time to + * attempt to acquire the lock bit. + */ + preempt_disable(); +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + while (test_and_set_bit(bitnum, addr)) { + while (test_bit(bitnum, addr)) + cpu_relax(); + } +#endif +} + +/* + * Return true if it was acquired + */ +static inline int bit_spin_trylock(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + int ret; + + preempt_disable(); + ret = !test_and_set_bit(bitnum, addr); + if (!ret) + preempt_enable(); + return ret; +#else + preempt_disable(); + return 1; +#endif +} + +/* + * bit-based spin_unlock() + */ +static inline void bit_spin_unlock(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + BUG_ON(!test_bit(bitnum, addr)); + smp_mb__before_clear_bit(); + clear_bit(bitnum, addr); +#endif + preempt_enable(); +} + +/* + * Return true if the lock is held. + */ +static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) +{ +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) + return test_bit(bitnum, addr); +#elif defined CONFIG_PREEMPT + return preempt_count(); +#else + return 1; +#endif +} + +#endif /* __LINUX_SPINLOCK_H */ diff -Nru a/include/linux/sysctl.h b/include/linux/sysctl.h --- a/include/linux/sysctl.h Fri Oct 31 14:10:54 2003 +++ b/include/linux/sysctl.h Fri Oct 31 14:10:54 2003 @@ -217,7 +217,8 @@ NET_CORE_NO_CONG=14, NET_CORE_LO_CONG=15, NET_CORE_MOD_CONG=16, - NET_CORE_DEV_WEIGHT=17 + NET_CORE_DEV_WEIGHT=17, + NET_CORE_SOMAXCONN=18, }; /* /proc/sys/net/ethernet */ diff -Nru a/include/linux/udp.h b/include/linux/udp.h --- a/include/linux/udp.h Fri Oct 31 14:10:53 2003 +++ b/include/linux/udp.h Fri Oct 31 14:10:53 2003 @@ -44,13 +44,9 @@ unsigned int corkflag; /* Cork is required */ __u16 encap_type; /* Is this an Encapsulation socket? */ /* - * Following members retains the infomation to create a UDP header + * Following member retains the infomation to create a UDP header * when the socket is uncorked. 
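atomic_dec_and_lock() returns with the lock held only on the 1->0 transition, which is exactly the shape of refcounted-object teardown; on UP without preemption it collapsed to plain atomic_dec_and_test() earlier in this header. A sketch with a hypothetical struct obj:

    struct obj {
        atomic_t refcnt;
        struct list_head list;    /* guarded by obj_lock */
    };
    static spinlock_t obj_lock = SPIN_LOCK_UNLOCKED;

    static void obj_put(struct obj *o)
    {
        if (atomic_dec_and_lock(&o->refcnt, &obj_lock)) {
            list_del(&o->list);    /* unlink under the lock */
            spin_unlock(&obj_lock);
            kfree(o);
        }
    }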
*/ - u32 saddr; /* source address */ - u32 daddr; /* destination address */ - __u16 sport; /* source port */ - __u16 dport; /* destination port */ __u16 len; /* total length of pending frames */ }; diff -Nru a/include/linux/writeback.h b/include/linux/writeback.h --- a/include/linux/writeback.h Fri Oct 31 14:10:53 2003 +++ b/include/linux/writeback.h Fri Oct 31 14:10:53 2003 @@ -55,6 +55,7 @@ * fs/fs-writeback.c */ void writeback_inodes(struct writeback_control *wbc); +void generic_sync_sb_inodes(struct super_block * sb, struct writeback_control * wbc); void wake_up_inode(struct inode *inode); void __wait_on_inode(struct inode * inode); void sync_inodes_sb(struct super_block *, int wait); diff -Nru a/include/linux/writeback.h~sb_sync_inodes.diff b/include/linux/writeback.h~sb_sync_inodes.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/include/linux/writeback.h~sb_sync_inodes.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,96 @@ +/* + * include/linux/writeback.h. + */ +#ifndef WRITEBACK_H +#define WRITEBACK_H + +struct backing_dev_info; + +extern spinlock_t inode_lock; +extern struct list_head inode_in_use; +extern struct list_head inode_unused; + +/* + * Yes, writeback.h requires sched.h + * No, sched.h is not included from here. + */ +static inline int current_is_pdflush(void) +{ + return current->flags & PF_FLUSHER; +} + +/* + * fs/fs-writeback.c + */ +enum writeback_sync_modes { + WB_SYNC_NONE, /* Don't wait on anything */ + WB_SYNC_ALL, /* Wait on every mapping */ + WB_SYNC_HOLD, /* Hold the inode on sb_dirty for sys_sync() */ +}; + +/* + * A control structure which tells the writeback code what to do + */ +struct writeback_control { + struct backing_dev_info *bdi; /* If !NULL, only write back this + queue */ + enum writeback_sync_modes sync_mode; + unsigned long *older_than_this; /* If !NULL, only write back inodes + older than this */ + long nr_to_write; /* Write this many pages, and decrement + this for each page written */ + int nonblocking; /* Don't get stuck on request queues */ + int encountered_congestion; /* An output: a queue is full */ + int for_kupdate; /* A kupdate writeback */ + int for_reclaim; /* Invoked from the page allocator */ +}; + +/* + * ->writepage() return values (make these much larger than a pagesize, in + * case some fs is returning number-of-bytes-written from writepage) + */ +#define WRITEPAGE_ACTIVATE 0x80000 /* IO was not started: activate page */ + +/* + * fs/fs-writeback.c + */ +void writeback_inodes(struct writeback_control *wbc); +void wake_up_inode(struct inode *inode); +void __wait_on_inode(struct inode * inode); +void sync_inodes_sb(struct super_block *, int wait); +void sync_inodes(int wait); + +/* writeback.h requires fs.h; it, too, is not included from here. */ +static inline void wait_on_inode(struct inode *inode) +{ + if (inode->i_state & I_LOCK) + __wait_on_inode(inode); +} + +/* + * mm/page-writeback.c + */ +int wakeup_bdflush(long nr_pages); + +/* These 5 are exported to sysctl. 
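Every writeback entry point is steered by one of these writeback_control blocks; callers fill in only the fields they care about and read the outputs back. A sketch of a pdflush-style background pass (the 1024-page quota is an arbitrary example):

    struct writeback_control wbc = {
        .bdi             = NULL,           /* all queues */
        .sync_mode       = WB_SYNC_NONE,   /* start IO, don't wait on it */
        .older_than_this = NULL,
        .nr_to_write     = 1024,           /* decremented per page written */
        .nonblocking     = 1,              /* back off on congested queues */
    };

    writeback_inodes(&wbc);
    /* 1024 - wbc.nr_to_write pages actually went out;
       wbc.encountered_congestion says why we may have stopped early */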
*/ +extern int dirty_background_ratio; +extern int vm_dirty_ratio; +extern int dirty_writeback_centisecs; +extern int dirty_expire_centisecs; + +struct ctl_table; +struct file; +int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *, + void __user *, size_t *); + +void page_writeback_init(void); +void balance_dirty_pages_ratelimited(struct address_space *mapping); +int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); +int do_writepages(struct address_space *mapping, struct writeback_control *wbc); + +/* pdflush.c */ +extern int nr_pdflush_threads; /* Global so it can be exported to sysctl + read-only. */ + + +#endif /* WRITEBACK_H */ diff -Nru a/include/net/if_inet6.h b/include/net/if_inet6.h --- a/include/net/if_inet6.h Fri Oct 31 14:10:54 2003 +++ b/include/net/if_inet6.h Fri Oct 31 14:10:54 2003 @@ -175,6 +175,8 @@ u8 entropy[8]; struct timer_list regen_timer; struct inet6_ifaddr *tempaddr_list; + __u8 work_eui64[8]; + __u8 work_digest[16]; #endif struct neigh_parms *nd_parms; diff -Nru a/include/net/tcp.h b/include/net/tcp.h --- a/include/net/tcp.h Fri Oct 31 14:10:53 2003 +++ b/include/net/tcp.h Fri Oct 31 14:10:53 2003 @@ -219,6 +219,7 @@ #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct in6_addr tw_v6_daddr; struct in6_addr tw_v6_rcv_saddr; + int tw_v6_ipv6only; #endif }; @@ -266,6 +267,38 @@ hlist_for_each_entry_safe(tw, node, safe, jail, tw_death_node) #define tcptw_sk(__sk) ((struct tcp_tw_bucket *)(__sk)) + +static inline const u32 tcp_v4_rcv_saddr(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? + inet_sk(sk)->rcv_saddr : tcptw_sk(sk)->tw_rcv_saddr; +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static inline const struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? + &inet6_sk(sk)->rcv_saddr : &tcptw_sk(sk)->tw_v6_rcv_saddr; +} + +static inline const struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) +{ + return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; +} + +#define tcptw_sk_ipv6only(__sk) (tcptw_sk(__sk)->tw_v6_ipv6only) + +static inline int tcp_v6_ipv6only(const struct sock *sk) +{ + return likely(sk->sk_state != TCP_TIME_WAIT) ? 
+ ipv6_only_sock(sk) : tcptw_sk_ipv6only(sk); +} +#else +# define __tcp_v6_rcv_saddr(__sk) NULL +# define tcp_v6_rcv_saddr(__sk) NULL +# define tcptw_sk_ipv6only(__sk) 0 +# define tcp_v6_ipv6only(__sk) 0 +#endif extern kmem_cache_t *tcp_timewait_cachep; diff -Nru a/kernel/exit.c b/kernel/exit.c --- a/kernel/exit.c Fri Oct 31 14:10:54 2003 +++ b/kernel/exit.c Fri Oct 31 14:10:54 2003 @@ -594,6 +594,7 @@ */ static void exit_notify(struct task_struct *tsk) { + int state; struct task_struct *t; if (signal_pending(tsk) && !tsk->signal->group_exit @@ -687,7 +688,12 @@ do_notify_parent(tsk, SIGCHLD); } - tsk->state = TASK_ZOMBIE; + state = TASK_ZOMBIE; + if (tsk->exit_signal == -1 && tsk->ptrace == 0) + state = TASK_DEAD; + tsk->state = state; + tsk->flags |= PF_DEAD; + /* * In the preemption case it must be impossible for the task * to get runnable again, so use "_raw_" unlock to keep @@ -702,6 +708,11 @@ */ _raw_write_unlock(&tasklist_lock); local_irq_enable(); + + /* If the process is dead, release it - nobody will wait for it */ + if (state == TASK_DEAD) + release_task(tsk); + } NORET_TYPE void do_exit(long code) @@ -750,10 +761,6 @@ tsk->exit_code = code; exit_notify(tsk); - - if (tsk->exit_signal == -1 && tsk->ptrace == 0) - release_task(tsk); - schedule(); BUG(); /* Avoid "noreturn function does return". */ diff -Nru a/kernel/module.c b/kernel/module.c --- a/kernel/module.c Fri Oct 31 14:10:54 2003 +++ b/kernel/module.c Fri Oct 31 14:10:54 2003 @@ -1658,7 +1658,7 @@ NULL); } if (err < 0) - goto cleanup; + goto arch_cleanup; /* Get rid of temporary copy */ vfree(hdr); @@ -1666,6 +1666,8 @@ /* Done! */ return mod; + arch_cleanup: + module_arch_cleanup(mod); cleanup: module_unload_free(mod); module_free(mod, mod->module_init); diff -Nru a/kernel/sched.c b/kernel/sched.c --- a/kernel/sched.c Fri Oct 31 14:10:54 2003 +++ b/kernel/sched.c Fri Oct 31 14:10:54 2003 @@ -742,7 +742,7 @@ { runqueue_t *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; - int drop_task_ref; + unsigned long prev_task_flags; rq->prev_mm = NULL; @@ -757,14 +757,11 @@ * be dropped twice. * Manfred Spraul <manfred@colorfullife.com> */ - drop_task_ref = 0; - if (unlikely(prev->state & (TASK_DEAD | TASK_ZOMBIE))) - drop_task_ref = 1; - + prev_task_flags = prev->flags; finish_arch_switch(rq, prev); if (mm) mmdrop(mm); - if (drop_task_ref) + if (unlikely(prev_task_flags & PF_DEAD)) put_task_struct(prev); } @@ -2809,6 +2806,7 @@ rq->active = rq->arrays; rq->expired = rq->arrays + 1; spin_lock_init(&rq->lock); + spin_lock_dont_check(&rq->lock); INIT_LIST_HEAD(&rq->migration_queue); atomic_set(&rq->nr_iowait, 0); nr_running_init(rq); diff -Nru a/kernel/sched.c~spinlock-owner.diff b/kernel/sched.c~spinlock-owner.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/kernel/sched.c~spinlock-owner.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,2910 @@ +/* + * kernel/sched.c + * + * Kernel scheduler and related syscalls + * + * Copyright (C) 1991-2002 Linus Torvalds + * + * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and + * make semaphores SMP safe + * 1998-11-19 Implemented schedule_timeout() and related stuff + * by Andrea Arcangeli + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. 
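Restating the exit.c and sched.c hunks above as one sketch: a detached task (exit_signal == -1, not ptraced) now reaps itself, and because release_task() can free the task struct, finish_task_switch() must sample prev->flags before finish_arch_switch() releases the runqueue lock:

    /* exit_notify(), simplified */
    state = TASK_ZOMBIE;
    if (tsk->exit_signal == -1 && tsk->ptrace == 0)
        state = TASK_DEAD;              /* nobody will wait4() us */
    tsk->state = state;
    tsk->flags |= PF_DEAD;
    /* ... tasklist_lock dropped, irqs back on ... */
    if (state == TASK_DEAD)
        release_task(tsk);              /* self-reap */

    /* finish_task_switch(), simplified */
    prev_task_flags = prev->flags;      /* prev may be freed after this */
    finish_arch_switch(rq, prev);
    if (unlikely(prev_task_flags & PF_DEAD))
        put_task_struct(prev);          /* the final reference goes away */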
+ */ + +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <asm/uaccess.h> +#include <linux/highmem.h> +#include <linux/smp_lock.h> +#include <asm/mmu_context.h> +#include <linux/interrupt.h> +#include <linux/completion.h> +#include <linux/kernel_stat.h> +#include <linux/security.h> +#include <linux/notifier.h> +#include <linux/suspend.h> +#include <linux/blkdev.h> +#include <linux/delay.h> +#include <linux/timer.h> +#include <linux/rcupdate.h> +#include <linux/cpu.h> +#include <linux/percpu.h> + +#ifdef CONFIG_NUMA +#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) +#else +#define cpu_to_node_mask(cpu) (cpu_online_map) +#endif + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +#define AVG_TIMESLICE (MIN_TIMESLICE + ((MAX_TIMESLICE - MIN_TIMESLICE) *\ + (MAX_PRIO-1-NICE_TO_PRIO(0))/(MAX_USER_PRIO - 1))) + +/* + * Some helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) + +/* + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, + * maximum timeslice is 200 msecs. Timeslices get refilled after + * they expire. + */ +#define MIN_TIMESLICE ( 10 * HZ / 1000) +#define MAX_TIMESLICE (200 * HZ / 1000) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define INTERACTIVE_DELTA 2 +#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) +#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) +#define NODE_THRESHOLD 125 +#define CREDIT_LIMIT 100 + +/* + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. + * + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: + * + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. 
+ */ + +#define CURRENT_BONUS(p) \ + (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ + MAX_SLEEP_AVG) + +#ifdef CONFIG_SMP +#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ + num_online_cpus()) +#else +#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) +#endif + +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) + +#define DELTA(p) \ + (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ + INTERACTIVE_DELTA) + +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) + +#define JUST_INTERACTIVE_SLEEP(p) \ + (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ + (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) + +#define HIGH_CREDIT(p) \ + ((p)->interactive_credit > CREDIT_LIMIT) + +#define LOW_CREDIT(p) \ + ((p)->interactive_credit < -CREDIT_LIMIT) + +#define TASK_PREEMPTS_CURR(p, rq) \ + ((p)->prio < (rq)->curr->prio) + +/* + * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] + * to time slice values. + * + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. + * + * task_timeslice() is the interface that is used by the scheduler. + */ + +#define BASE_TIMESLICE(p) (MIN_TIMESLICE + \ + ((MAX_TIMESLICE - MIN_TIMESLICE) * (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1))) + +static inline unsigned int task_timeslice(task_t *p) +{ + return BASE_TIMESLICE(p); +} + +/* + * These are the runqueue data structures: + */ + +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) + +typedef struct runqueue runqueue_t; + +struct prio_array { + int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + struct list_head queue[MAX_PRIO]; +}; + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct runqueue { + spinlock_t lock; + unsigned long nr_running, nr_switches, expired_timestamp, + nr_uninterruptible; + task_t *curr, *idle; + struct mm_struct *prev_mm; + prio_array_t *active, *expired, arrays[2]; + int prev_cpu_load[NR_CPUS]; +#ifdef CONFIG_NUMA + atomic_t *node_nr_running; + int prev_node_load[MAX_NUMNODES]; +#endif + task_t *migration_thread; + struct list_head migration_queue; + + atomic_t nr_iowait; +}; + +static DEFINE_PER_CPU(struct runqueue, runqueues); + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) + +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(rq, next) do { } while(0) +# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define task_running(rq, p) ((rq)->curr == (p)) +#endif + +#ifdef CONFIG_NUMA + +/* + * Keep track of running tasks. 
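Plugging HZ=1000 into the knobs above (a worked example): MIN_TIMESLICE is 10 ticks, MAX_TIMESLICE is 200, and task_timeslice() interpolates linearly across the 40 nice levels:

    nice -20: static_prio 100 -> 10 + 190*(139-100)/39 = 200 ticks = 200ms
    nice   0: static_prio 120 -> 10 + 190*(139-120)/39 = 102 ticks ~ 100ms
    nice +19: static_prio 139 -> 10 + 190*(139-139)/39 =  10 ticks =  10ms

which is where the 10/100/200 msec figures in the comment above come from.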
+ */ + +static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp = + {[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)}; + +static inline void nr_running_init(struct runqueue *rq) +{ + rq->node_nr_running = &node_nr_running[0]; +} + +static inline void nr_running_inc(runqueue_t *rq) +{ + atomic_inc(rq->node_nr_running); + rq->nr_running++; +} + +static inline void nr_running_dec(runqueue_t *rq) +{ + atomic_dec(rq->node_nr_running); + rq->nr_running--; +} + +__init void node_nr_running_init(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) { + if (cpu_possible(i)) + cpu_rq(i)->node_nr_running = + &node_nr_running[cpu_to_node(i)]; + } +} + +#else /* !CONFIG_NUMA */ + +# define nr_running_init(rq) do { } while (0) +# define nr_running_inc(rq) do { (rq)->nr_running++; } while (0) +# define nr_running_dec(rq) do { (rq)->nr_running--; } while (0) + +#endif /* CONFIG_NUMA */ + +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ +static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +{ + struct runqueue *rq; + +repeat_lock_task: + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + goto repeat_lock_task; + } + return rq; +} + +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); +} + +/* + * rq_lock - lock a given runqueue and disable interrupts. + */ +static inline runqueue_t *this_rq_lock(void) +{ + runqueue_t *rq; + + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); + + return rq; +} + +static inline void rq_unlock(runqueue_t *rq) +{ + spin_unlock_irq(&rq->lock); +} + +/* + * Adding/removing a task to/from a priority array: + */ +static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +{ + array->nr_active--; + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} + +static inline void enqueue_task(struct task_struct *p, prio_array_t *array) +{ + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} + +/* + * effective_prio - return the priority that is based on the static + * priority but is modified by bonuses/penalties. + * + * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. + */ +static int effective_prio(task_t *p) +{ + int bonus, prio; + + if (rt_task(p)) + return p->prio; + + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + prio = MAX_PRIO-1; + return prio; +} + +/* + * __activate_task - move a task to the runqueue. 
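Numerically (a worked example): MAX_BONUS = MAX_USER_PRIO * PRIO_BONUS_RATIO / 100 = 40 * 25 / 100 = 10, so CURRENT_BONUS() runs 0..10 and the bonus term in effective_prio() spans -5..+5, a quarter of the 40-level nice range:

    fully slept nice 0 task:    prio = 120 - 5 = 115
    never-sleeping nice 0 task: prio = 120 + 5 = 125
    (RT tasks skip all of this; effective_prio() returns p->prio as-is)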
+ */ +static inline void __activate_task(task_t *p, runqueue_t *rq) +{ + enqueue_task(p, rq->active); + nr_running_inc(rq); +} + +static void recalc_task_prio(task_t *p, unsigned long long now) +{ + unsigned long long __sleep_time = now - p->timestamp; + unsigned long sleep_time; + + if (__sleep_time > NS_MAX_SLEEP_AVG) + sleep_time = NS_MAX_SLEEP_AVG; + else + sleep_time = (unsigned long)__sleep_time; + + if (likely(sleep_time > 0)) { + /* + * User tasks that sleep a long time are categorised as + * idle and will get just interactive status to stay active & + * prevent them suddenly becoming cpu hogs and starving + * other processes. + */ + if (p->mm && p->activated != -1 && + sleep_time > JUST_INTERACTIVE_SLEEP(p)){ + p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - + AVG_TIMESLICE); + if (!HIGH_CREDIT(p)) + p->interactive_credit++; + } else { + /* + * The lower the sleep avg a task has the more + * rapidly it will rise with sleep time. + */ + sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; + + /* + * Tasks with low interactive_credit are limited to + * one timeslice worth of sleep avg bonus. + */ + if (LOW_CREDIT(p) && + sleep_time > JIFFIES_TO_NS(task_timeslice(p))) + sleep_time = + JIFFIES_TO_NS(task_timeslice(p)); + + /* + * Non high_credit tasks waking from uninterruptible + * sleep are limited in their sleep_avg rise as they + * are likely to be cpu hogs waiting on I/O + */ + if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm){ + if (p->sleep_avg >= JUST_INTERACTIVE_SLEEP(p)) + sleep_time = 0; + else if (p->sleep_avg + sleep_time >= + JUST_INTERACTIVE_SLEEP(p)){ + p->sleep_avg = + JUST_INTERACTIVE_SLEEP(p); + sleep_time = 0; + } + } + + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a task + * spends sleeping, the higher the average gets - and the + * higher the priority boost gets as well. + */ + p->sleep_avg += sleep_time; + + if (p->sleep_avg > NS_MAX_SLEEP_AVG){ + p->sleep_avg = NS_MAX_SLEEP_AVG; + if (!HIGH_CREDIT(p)) + p->interactive_credit++; + } + } + } + + p->prio = effective_prio(p); +} + +/* + * activate_task - move a task to the runqueue and do priority recalculation + * + * Update all the scheduling statistics stuff. (sleep average + * calculation, priority modifiers, etc.) + */ +static inline void activate_task(task_t *p, runqueue_t *rq) +{ + unsigned long long now = sched_clock(); + + recalc_task_prio(p, now); + + /* + * This checks to make sure it's not an uninterruptible task + * that is now waking up. + */ + if (!p->activated){ + /* + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ + if (in_interrupt()) + p->activated = 2; + else + /* + * Normal first-time wakeups get a credit too for on-runqueue + * time, but it will be weighted down: + */ + p->activated = 1; + } + p->timestamp = now; + + __activate_task(p, rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + nr_running_dec(rq); + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + dequeue_task(p, p->array); + p->array = NULL; +} + +/* + * resched_task - mark a task 'to be rescheduled now'. 
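The "categorised as idle" cutoff in recalc_task_prio() above can be made concrete (worked example for a nice 0 task with HZ=1000, so DELTA(p) = 2):

    AVG_TIMESLICE = 102 ticks                    (see the timeslice numbers above)
    MAX_SLEEP_AVG = 102 * MAX_BONUS = 1020 ticks (about one second)
    JUST_INTERACTIVE_SLEEP = 1020 * (5 + 2 + 1) / 10 - 1 = 815 ticks

i.e. sleeping beyond roughly 0.8s earns no further credit: sleep_avg is pinned at MAX_SLEEP_AVG - AVG_TIMESLICE = 918 ticks rather than growing to the full 1020.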
+ * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +static inline void resched_task(task_t *p) +{ +#ifdef CONFIG_SMP + int need_resched, nrpolling; + + preempt_disable(); + /* minimise the chance of sending an interrupt to poll_idle() */ + nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); + need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); + nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); + + if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) + smp_send_reschedule(task_cpu(p)); + preempt_enable(); +#else + set_tsk_need_resched(p); +#endif +} + +#ifdef CONFIG_SMP + +/* + * wait_task_inactive - wait for a thread to unschedule. + * + * The caller must ensure that the task *will* unschedule sometime soon, + * else this function might spin for a *long* time. This function can't + * be called with interrupts off, or it may introduce deadlock with + * smp_call_function() if an IPI is sent by the same process we are + * waiting to become inactive. + */ +void wait_task_inactive(task_t * p) +{ + unsigned long flags; + runqueue_t *rq; + +repeat: + preempt_disable(); + rq = task_rq(p); + if (unlikely(task_running(rq, p))) { + cpu_relax(); + /* + * enable/disable preemption just to make this + * a preemption point - we are busy-waiting + * anyway. + */ + preempt_enable(); + goto repeat; + } + rq = task_rq_lock(p, &flags); + if (unlikely(task_running(rq, p))) { + task_rq_unlock(rq, &flags); + preempt_enable(); + goto repeat; + } + task_rq_unlock(rq, &flags); + preempt_enable(); +} +#endif + +/*** + * try_to_wake_up - wake up a thread + * @p: the to-be-woken-up thread + * @state: the mask of task states that can be woken + * @sync: do a synchronous wakeup? + * @kick: kick the CPU if the task is already running? + * + * Put it on the run-queue if it's not already there. The "current" + * thread is always on the run-queue (except when the actual + * re-schedule is in progress), and as such you're allowed to do + * the simpler "current->state = TASK_RUNNING" to mark yourself + * runnable without the overhead of this. + * + * returns failure only if the task is already active. + */ +static int try_to_wake_up(task_t * p, unsigned int state, int sync, int kick) +{ + unsigned long flags; + int success = 0; + long old_state; + runqueue_t *rq; + +repeat_lock_task: + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (old_state & state) { + if (!p->array) { + /* + * Fast-migrate the task if it's not running or runnable + * currently. Do not violate hard affinity. + */ + if (unlikely(sync && !task_running(rq, p) && + (task_cpu(p) != smp_processor_id()) && + cpu_isset(smp_processor_id(), p->cpus_allowed))) { + + set_task_cpu(p, smp_processor_id()); + task_rq_unlock(rq, &flags); + goto repeat_lock_task; + } + if (old_state == TASK_UNINTERRUPTIBLE){ + rq->nr_uninterruptible--; + /* + * Tasks on involuntary sleep don't earn + * sleep_avg beyond just interactive state. 
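+				 *
+				 * (Concretely: recalc_task_prio() above
+				 * checks for activated == -1 and, for tasks
+				 * without high interactive credit, caps the
+				 * sleep_avg bonus at JUST_INTERACTIVE_SLEEP()
+				 * instead of granting the full sleep bonus.)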
+				 */
+				p->activated = -1;
+			}
+			if (sync)
+				__activate_task(p, rq);
+			else {
+				activate_task(p, rq);
+				if (TASK_PREEMPTS_CURR(p, rq))
+					resched_task(rq->curr);
+			}
+			success = 1;
+		}
+#ifdef CONFIG_SMP
+		else
+			if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
+				smp_send_reschedule(task_cpu(p));
+#endif
+		p->state = TASK_RUNNING;
+	}
+	task_rq_unlock(rq, &flags);
+
+	return success;
+}
+
+int wake_up_process(task_t * p)
+{
+	return try_to_wake_up(p, TASK_STOPPED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 0);
+}
+
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_process_kick(task_t * p)
+{
+	return try_to_wake_up(p, TASK_STOPPED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 1);
+}
+
+int wake_up_state(task_t *p, unsigned int state)
+{
+	return try_to_wake_up(p, state, 0, 0);
+}
+
+/*
+ * wake_up_forked_process - wake up a freshly forked process.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created process.
+ */
+void wake_up_forked_process(task_t * p)
+{
+	unsigned long flags;
+	runqueue_t *rq = task_rq_lock(current, &flags);
+
+	p->state = TASK_RUNNING;
+	/*
+	 * We decrease the sleep average of forking parents
+	 * and children as well, to keep max-interactive tasks
+	 * from forking tasks that are max-interactive.
+	 */
+	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
+		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
+
+	p->interactive_credit = 0;
+
+	p->prio = effective_prio(p);
+	set_task_cpu(p, smp_processor_id());
+
+	if (unlikely(!current->array))
+		__activate_task(p, rq);
+	else {
+		p->prio = current->prio;
+		list_add_tail(&p->run_list, &current->run_list);
+		p->array = current->array;
+		p->array->nr_active++;
+		nr_running_inc(rq);
+	}
+	task_rq_unlock(rq, &flags);
+}
+
+/*
+ * Potentially available exiting-child timeslices are
+ * retrieved here - this way the parent does not get
+ * penalized for creating too many threads.
+ *
+ * (this cannot be used to 'generate' timeslices
+ * artificially, because any timeslice recovered here
+ * was given away by the parent in the first place.)
+ */
+void sched_exit(task_t * p)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (p->first_time_slice) {
+		p->parent->time_slice += p->time_slice;
+		if (unlikely(p->parent->time_slice > MAX_TIMESLICE))
+			p->parent->time_slice = MAX_TIMESLICE;
+	}
+	local_irq_restore(flags);
+	/*
+	 * If the child was a (relative-) CPU hog then decrease
+	 * the sleep_avg of the parent as well.
+	 */
+	if (p->sleep_avg < p->parent->sleep_avg)
+		p->parent->sleep_avg = p->parent->sleep_avg /
+			(EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
+			(EXIT_WEIGHT + 1);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * We enter this with the runqueue still locked, and finish_arch_switch()
+ * will unlock it along with doing any other architecture-specific cleanup
+ * actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock. (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ */
+static inline void finish_task_switch(task_t *prev)
+{
+	runqueue_t *rq = this_rq();
+	struct mm_struct *mm = rq->prev_mm;
+	int drop_task_ref;
+
+	rq->prev_mm = NULL;
+
+	/*
+	 * A task struct has one reference for the use as "current".
+ * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls + * schedule one last time. The schedule call will never return, + * and the scheduled task must drop that reference. + * The test for TASK_ZOMBIE must occur while the runqueue locks are + * still held, otherwise prev could be scheduled on another cpu, die + * there before we look at prev->state, and then the reference would + * be dropped twice. + * Manfred Spraul <manfred@colorfullife.com> + */ + drop_task_ref = 0; + if (unlikely(prev->state & (TASK_DEAD | TASK_ZOMBIE))) + drop_task_ref = 1; + + finish_arch_switch(rq, prev); + if (mm) + mmdrop(mm); + if (drop_task_ref) + put_task_struct(prev); +} + +/** + * schedule_tail - first thing a freshly forked thread must call. + * @prev: the thread we just switched away from. + */ +asmlinkage void schedule_tail(task_t *prev) +{ + finish_task_switch(prev); + + if (current->set_child_tid) + put_user(current->pid, current->set_child_tid); +} + +/* + * context_switch - switch to the new MM and the new + * thread's register state. + */ +static inline task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next); + } else + switch_mm(oldmm, mm, next); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + WARN_ON(rq->prev_mm); + rq->prev_mm = oldmm; + } + + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); + + return prev; +} + +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, current number of uninterruptible-sleeping threads, total + * number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; + + for (i = 0; i < NR_CPUS; i++) + sum += cpu_rq(i)->nr_running; + + return sum; +} + +unsigned long nr_uninterruptible(void) +{ + unsigned long i, sum = 0; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + sum += cpu_rq(i)->nr_uninterruptible; + } + return sum; +} + +unsigned long nr_context_switches(void) +{ + unsigned long i, sum = 0; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + sum += cpu_rq(i)->nr_switches; + } + return sum; +} + +unsigned long nr_iowait(void) +{ + unsigned long i, sum = 0; + + for (i = 0; i < NR_CPUS; ++i) { + if (!cpu_online(i)) + continue; + sum += atomic_read(&cpu_rq(i)->nr_iowait); + } + return sum; +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) +{ + if (rq1 == rq2) + spin_lock(&rq1->lock); + else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +{ + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); +} + +#ifdef CONFIG_NUMA +/* + * If dest_cpu is allowed for this process, migrate the task to it. 
+ * This is accomplished by forcing the cpus_allowed mask to only
+ * allow dest_cpu, which will force the task onto dest_cpu. Then
+ * the cpus_allowed mask is restored.
+ */
+static void sched_migrate_task(task_t *p, int dest_cpu)
+{
+	cpumask_t old_mask;
+
+	old_mask = p->cpus_allowed;
+	if (!cpu_isset(dest_cpu, old_mask))
+		return;
+	/* force the process onto the specified CPU */
+	set_cpus_allowed(p, cpumask_of_cpu(dest_cpu));
+
+	/* restore the cpus allowed mask */
+	set_cpus_allowed(p, old_mask);
+}
+
+/*
+ * Find the least loaded CPU. Slightly favor the current CPU by
+ * setting its runqueue length as the minimum to start.
+ */
+static int sched_best_cpu(struct task_struct *p)
+{
+	int i, minload, load, best_cpu, node = 0;
+	cpumask_t cpumask;
+
+	best_cpu = task_cpu(p);
+	if (cpu_rq(best_cpu)->nr_running <= 2)
+		return best_cpu;
+
+	minload = 10000000;
+	for_each_node_with_cpus(i) {
+		/*
+		 * Node load is always divided by nr_cpus_node to normalise
+		 * load values in case cpu count differs from node to node.
+		 * We first multiply node_nr_running by 10 to get a little
+		 * better resolution.
+		 */
+		load = 10 * atomic_read(&node_nr_running[i]) / nr_cpus_node(i);
+		if (load < minload) {
+			minload = load;
+			node = i;
+		}
+	}
+
+	minload = 10000000;
+	cpumask = node_to_cpumask(node);
+	for (i = 0; i < NR_CPUS; ++i) {
+		if (!cpu_isset(i, cpumask))
+			continue;
+		if (cpu_rq(i)->nr_running < minload) {
+			best_cpu = i;
+			minload = cpu_rq(i)->nr_running;
+		}
+	}
+	return best_cpu;
+}
+
+void sched_balance_exec(void)
+{
+	int new_cpu;
+
+	if (numnodes > 1) {
+		new_cpu = sched_best_cpu(current);
+		if (new_cpu != smp_processor_id())
+			sched_migrate_task(current, new_cpu);
+	}
+}
+
+/*
+ * Find the busiest node. All previous node loads contribute with a
+ * geometrically decaying weight to the load measure:
+ *	load_{t} = load_{t-1}/2 + nr_node_running_{t}
+ * This way sudden load peaks are flattened out a bit.
+ * Node load is divided by nr_cpus_node() in order to compare nodes
+ * of different cpu count but also [first] multiplied by 10 to
+ * provide better resolution.
+ */
+static int find_busiest_node(int this_node)
+{
+	int i, node = -1, load, this_load, maxload;
+
+	if (!nr_cpus_node(this_node))
+		return node;
+	this_load = maxload = (this_rq()->prev_node_load[this_node] >> 1)
+		+ (10 * atomic_read(&node_nr_running[this_node])
+		/ nr_cpus_node(this_node));
+	this_rq()->prev_node_load[this_node] = this_load;
+	for_each_node_with_cpus(i) {
+		if (i == this_node)
+			continue;
+		load = (this_rq()->prev_node_load[i] >> 1)
+			+ (10 * atomic_read(&node_nr_running[i])
+			/ nr_cpus_node(i));
+		this_rq()->prev_node_load[i] = load;
+		if (load > maxload && (100*load > NODE_THRESHOLD*this_load)) {
+			maxload = load;
+			node = i;
+		}
+	}
+	return node;
+}
+
+#endif /* CONFIG_NUMA */
+
+#ifdef CONFIG_SMP
+
+/*
+ * double_lock_balance - lock the busiest runqueue
+ *
+ * this_rq is locked already. Recalculate nr_running if we have to
+ * drop the runqueue lock.
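+ *
+ * The trylock-then-reorder dance mirrors double_rq_lock(): when the
+ * trylock fails and busiest has the lower address, this_rq->lock is
+ * dropped so both locks can be taken in ascending address order.
+ * nr_running may have changed while this_rq->lock was dropped, hence
+ * the recalculation.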
+ */
+static inline unsigned int double_lock_balance(runqueue_t *this_rq,
+	runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running)
+{
+	if (unlikely(!spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {
+			spin_unlock(&this_rq->lock);
+			spin_lock(&busiest->lock);
+			spin_lock(&this_rq->lock);
+			/* Need to recalculate nr_running */
+			if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
+				nr_running = this_rq->nr_running;
+			else
+				nr_running = this_rq->prev_cpu_load[this_cpu];
+		} else
+			spin_lock(&busiest->lock);
+	}
+	return nr_running;
+}
+
+/*
+ * find_busiest_queue - find the busiest runqueue among the cpus in cpumask.
+ */
+static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance, cpumask_t cpumask)
+{
+	int nr_running, load, max_load, i;
+	runqueue_t *busiest, *rq_src;
+
+	/*
+	 * We search all runqueues to find the busiest one.
+	 * We do this lockless to reduce cache-bouncing overhead,
+	 * we re-check the 'best' source CPU later on again, with
+	 * the lock held.
+	 *
+	 * We fend off statistical fluctuations in runqueue lengths by
+	 * saving the runqueue length during the previous load-balancing
+	 * operation and using the smaller of the current and saved lengths.
+	 * If a runqueue stays long over a longer period of time then
+	 * we recognize it and pull tasks from it.
+	 *
+	 * The 'current runqueue length' is a statistical maximum variable,
+	 * for that one we take the longer one - to avoid fluctuations in
+	 * the other direction. So for a load-balance to happen it needs
+	 * a stable long runqueue on the target CPU and a stable short
+	 * runqueue locally.
+	 *
+	 * We make an exception if this CPU is about to become idle - in
+	 * that case we are less picky about moving a task across CPUs and
+	 * take what can be taken.
+	 */
+	if (idle || (this_rq->nr_running > this_rq->prev_cpu_load[this_cpu]))
+		nr_running = this_rq->nr_running;
+	else
+		nr_running = this_rq->prev_cpu_load[this_cpu];
+
+	busiest = NULL;
+	max_load = 1;
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_isset(i, cpumask))
+			continue;
+
+		rq_src = cpu_rq(i);
+		if (idle || (rq_src->nr_running < this_rq->prev_cpu_load[i]))
+			load = rq_src->nr_running;
+		else
+			load = this_rq->prev_cpu_load[i];
+		this_rq->prev_cpu_load[i] = rq_src->nr_running;
+
+		if ((load > max_load) && (rq_src != this_rq)) {
+			busiest = rq_src;
+			max_load = load;
+		}
+	}
+
+	if (likely(!busiest))
+		goto out;
+
+	*imbalance = max_load - nr_running;
+
+	/* It needs at least a ~25% imbalance to trigger balancing. */
+	if (!idle && ((*imbalance)*4 < max_load)) {
+		busiest = NULL;
+		goto out;
+	}
+
+	nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running);
+	/*
+	 * Make sure nothing changed since we checked the
+	 * runqueue length.
+	 */
+	if (busiest->nr_running <= nr_running) {
+		spin_unlock(&busiest->lock);
+		busiest = NULL;
+	}
+out:
+	return busiest;
+}
+
+/*
+ * pull_task - move a task from a remote runqueue to the local runqueue.
+ * Both runqueues must be locked.
+ */
+static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
+{
+	dequeue_task(p, src_array);
+	nr_running_dec(src_rq);
+	set_task_cpu(p, this_cpu);
+	nr_running_inc(this_rq);
+	enqueue_task(p, this_rq->active);
+	/*
+	 * Note that idle threads have a prio of MAX_PRIO, for this test
+	 * to be always true for them.
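+	 *
+	 * (TASK_PREEMPTS_CURR() compares the pulled task's prio with the
+	 * destination CPU's current prio, so an idle destination CPU -
+	 * running its prio MAX_PRIO idle thread - always reschedules to
+	 * run the freshly pulled task.)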
+	 */
+	if (TASK_PREEMPTS_CURR(p, this_rq))
+		set_need_resched();
+}
+
+/*
+ * Previously:
+ *
+ * #define CAN_MIGRATE_TASK(p,rq,this_cpu)	\
+ *	((!idle || (NS_TO_JIFFIES(now - (p)->timestamp) > \
+ *		cache_decay_ticks)) && !task_running(rq, p) && \
+ *			cpu_isset(this_cpu, (p)->cpus_allowed))
+ */
+
+static inline int
+can_migrate_task(task_t *tsk, runqueue_t *rq, int this_cpu, int idle)
+{
+	unsigned long delta = sched_clock() - tsk->timestamp;
+
+	if (!idle && (delta <= JIFFIES_TO_NS(cache_decay_ticks)))
+		return 0;
+	if (task_running(rq, tsk))
+		return 0;
+	if (!cpu_isset(this_cpu, tsk->cpus_allowed))
+		return 0;
+	return 1;
+}
+
+/*
+ * Current runqueue is empty, or rebalance tick: if there is an
+ * imbalance (current runqueue is too short) then pull from
+ * busiest runqueue(s).
+ *
+ * We call this with the current runqueue locked,
+ * irqs disabled.
+ */
+static void load_balance(runqueue_t *this_rq, int idle, cpumask_t cpumask)
+{
+	int imbalance, idx, this_cpu = smp_processor_id();
+	runqueue_t *busiest;
+	prio_array_t *array;
+	struct list_head *head, *curr;
+	task_t *tmp;
+
+	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
+	if (!busiest)
+		goto out;
+
+	/*
+	 * We only want to steal a number of tasks equal to 1/2 the imbalance,
+	 * otherwise we'll just shift the imbalance to the new queue:
+	 */
+	imbalance /= 2;
+
+	/*
+	 * We first consider expired tasks. Those will likely not be
+	 * executed in the near future, and they are most likely to
+	 * be cache-cold, thus switching CPUs has the least effect
+	 * on them.
+	 */
+	if (busiest->expired->nr_active)
+		array = busiest->expired;
+	else
+		array = busiest->active;
+
+new_array:
+	/* Start searching at priority 0: */
+	idx = 0;
+skip_bitmap:
+	if (!idx)
+		idx = sched_find_first_bit(array->bitmap);
+	else
+		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+	if (idx >= MAX_PRIO) {
+		if (array == busiest->expired) {
+			array = busiest->active;
+			goto new_array;
+		}
+		goto out_unlock;
+	}
+
+	head = array->queue + idx;
+	curr = head->prev;
+skip_queue:
+	tmp = list_entry(curr, task_t, run_list);
+
+	/*
+	 * We do not migrate tasks that are:
+	 * 1) running (obviously), or
+	 * 2) not allowed on this CPU due to cpus_allowed, or
+	 * 3) still cache-hot on their current CPU.
+	 */
+
+	curr = curr->prev;
+
+	if (!can_migrate_task(tmp, busiest, this_cpu, idle)) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+	pull_task(busiest, array, tmp, this_rq, this_cpu);
+	if (!idle && --imbalance) {
+		if (curr != head)
+			goto skip_queue;
+		idx++;
+		goto skip_bitmap;
+	}
+out_unlock:
+	spin_unlock(&busiest->lock);
+out:
+	;
+}
+
+/*
+ * rebalance_tick() gets called every timer tick, on every CPU. Our
+ * balancing action frequency and balancing aggressiveness depend on
+ * whether the CPU is idle or not.
+ *
+ * busy-rebalance every 200 msecs. idle-rebalance every 1 msec. (or on
+ * systems with HZ=100, every 10 msecs.)
+ *
+ * On NUMA, do a node-rebalance every 400 msecs.
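+ *
+ * Worked out for HZ=1000: IDLE_REBALANCE_TICK is 1 tick (1 msec),
+ * BUSY_REBALANCE_TICK is 200 ticks (200 msecs), IDLE_NODE_REBALANCE_TICK
+ * is 5 ticks and BUSY_NODE_REBALANCE_TICK is 400 ticks (400 msecs).
+ * With HZ=100 the '?: 1' fallback below keeps IDLE_REBALANCE_TICK at
+ * one tick, i.e. 10 msecs.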
+ */
+#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
+#define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
+#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
+#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
+
+#ifdef CONFIG_NUMA
+static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
+{
+	int node = find_busiest_node(cpu_to_node(this_cpu));
+
+	if (node >= 0) {
+		cpumask_t cpumask = node_to_cpumask(node);
+		cpu_set(this_cpu, cpumask);
+		spin_lock(&this_rq->lock);
+		load_balance(this_rq, idle, cpumask);
+		spin_unlock(&this_rq->lock);
+	}
+}
+#endif
+
+static void rebalance_tick(runqueue_t *this_rq, int idle)
+{
+#ifdef CONFIG_NUMA
+	int this_cpu = smp_processor_id();
+#endif
+	unsigned long j = jiffies;
+
+	/*
+	 * First do inter-node rebalancing, then intra-node rebalancing,
+	 * if both events happen in the same tick. The inter-node
+	 * rebalancing does not necessarily have to create a perfect
+	 * balance within the node, since we load-balance the most loaded
+	 * node with the current CPU. (ie. other CPUs in the local node
+	 * are not balanced.)
+	 */
+	if (idle) {
+#ifdef CONFIG_NUMA
+		if (!(j % IDLE_NODE_REBALANCE_TICK))
+			balance_node(this_rq, idle, this_cpu);
+#endif
+		if (!(j % IDLE_REBALANCE_TICK)) {
+			spin_lock(&this_rq->lock);
+			load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
+			spin_unlock(&this_rq->lock);
+		}
+		return;
+	}
+#ifdef CONFIG_NUMA
+	if (!(j % BUSY_NODE_REBALANCE_TICK))
+		balance_node(this_rq, idle, this_cpu);
+#endif
+	if (!(j % BUSY_REBALANCE_TICK)) {
+		spin_lock(&this_rq->lock);
+		load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
+		spin_unlock(&this_rq->lock);
+	}
+}
+#else
+/*
+ * on UP we do not need to balance between CPUs:
+ */
+static inline void rebalance_tick(runqueue_t *this_rq, int idle)
+{
+}
+#endif
+
+DEFINE_PER_CPU(struct kernel_stat, kstat) = { { 0 } };
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+
+/*
+ * We place interactive tasks back into the active array, if possible.
+ *
+ * To guarantee that this does not starve expired tasks we ignore the
+ * interactivity of a task if the first expired task had to wait more
+ * than a 'reasonable' amount of time. This deadline timeout is
+ * load-dependent, as the frequency of array switches decreases with
+ * increasing number of running tasks:
+ */
+#define EXPIRED_STARVING(rq) \
+	(STARVATION_LIMIT && ((rq)->expired_timestamp && \
+		(jiffies - (rq)->expired_timestamp >= \
+			STARVATION_LIMIT * ((rq)->nr_running) + 1)))
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ *
+ * It also gets called by the fork code, when changing the parent's
+ * timeslices.
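+ *
+ * (The EXPIRED_STARVING() deadline above scales with load: with e.g.
+ * ten runnable tasks the array switch is forced once the first expired
+ * task has waited longer than 10 * STARVATION_LIMIT + 1 jiffies.)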
+ */ +void scheduler_tick(int user_ticks, int sys_ticks) +{ + int cpu = smp_processor_id(); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + runqueue_t *rq = this_rq(); + task_t *p = current; + + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, user_ticks); + + /* note: this timer irq context must be accounted for as well */ + if (hardirq_count() - HARDIRQ_OFFSET) { + cpustat->irq += sys_ticks; + sys_ticks = 0; + } else if (softirq_count()) { + cpustat->softirq += sys_ticks; + sys_ticks = 0; + } + + if (p == rq->idle) { + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait += sys_ticks; + else + cpustat->idle += sys_ticks; + rebalance_tick(rq, 1); + return; + } + if (TASK_NICE(p) > 0) + cpustat->nice += user_ticks; + else + cpustat->user += user_ticks; + cpustat->system += sys_ticks; + + /* Task might have expired already, but not scheduled off yet */ + if (p->array != rq->active) { + set_tsk_need_resched(p); + goto out; + } + spin_lock(&rq->lock); + /* + * The task was running during this tick - update the + * time slice counter. Note: we do not update a thread's + * priority until it either goes to sleep or uses up its + * timeslice. This makes it possible for interactive tasks + * to use up their timeslices at their highest priority levels. + */ + if (unlikely(rt_task(p))) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + dequeue_task(p, rq->active); + enqueue_task(p, rq->active); + } + goto out_unlock; + } + if (!--p->time_slice) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + enqueue_task(p, rq->expired); + } else + enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. + * + * This only applies to tasks in the interactive + * delta range with at least TIMESLICE_GRANULARITY to requeue. + */ + if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->time_slice >= TIMESLICE_GRANULARITY(p)) && + (p->array == rq->active)) { + + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + enqueue_task(p, rq->active); + } + } +out_unlock: + spin_unlock(&rq->lock); +out: + rebalance_tick(rq, 0); +} + +void scheduling_functions_start_here(void) { } + +/* + * schedule() is the main scheduler function. + */ +asmlinkage void schedule(void) +{ + task_t *prev, *next; + runqueue_t *rq; + prio_array_t *array; + struct list_head *queue; + unsigned long long now; + unsigned long run_time; + int idx; + + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. 
+ * Otherwise, whine if we are scheduling when we should not be.
+ */
+	if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) {
+		if (unlikely(in_atomic())) {
+			printk(KERN_ERR "bad: scheduling while atomic!\n");
+			dump_stack();
+		}
+	}
+
+need_resched:
+	preempt_disable();
+	prev = current;
+	rq = this_rq();
+
+	release_kernel_lock(prev);
+	now = sched_clock();
+	if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
+		run_time = now - prev->timestamp;
+	else
+		run_time = NS_MAX_SLEEP_AVG;
+
+	/*
+	 * Tasks with interactive credits get charged less run_time
+	 * at high sleep_avg to delay the loss of their interactive
+	 * status.
+	 */
+	if (HIGH_CREDIT(prev))
+		run_time /= (CURRENT_BONUS(prev) ? : 1);
+
+	spin_lock_irq(&rq->lock);
+
+	/*
+	 * If entering off of a kernel preemption, go straight
+	 * to picking the next task.
+	 */
+	if (unlikely(preempt_count() & PREEMPT_ACTIVE))
+		goto pick_next_task;
+
+	switch (prev->state) {
+	case TASK_INTERRUPTIBLE:
+		if (unlikely(signal_pending(prev))) {
+			prev->state = TASK_RUNNING;
+			break;
+		}
+	default:
+		deactivate_task(prev, rq);
+		prev->nvcsw++;
+		break;
+	case TASK_RUNNING:
+		prev->nivcsw++;
+	}
+pick_next_task:
+	if (unlikely(!rq->nr_running)) {
+#ifdef CONFIG_SMP
+		load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
+		if (rq->nr_running)
+			goto pick_next_task;
+#endif
+		next = rq->idle;
+		rq->expired_timestamp = 0;
+		goto switch_tasks;
+	}
+
+	array = rq->active;
+	if (unlikely(!array->nr_active)) {
+		/*
+		 * Switch the active and expired arrays.
+		 */
+		rq->active = rq->expired;
+		rq->expired = array;
+		array = rq->active;
+		rq->expired_timestamp = 0;
+	}
+
+	idx = sched_find_first_bit(array->bitmap);
+	queue = array->queue + idx;
+	next = list_entry(queue->next, task_t, run_list);
+
+	if (next->activated > 0) {
+		unsigned long long delta = now - next->timestamp;
+
+		if (next->activated == 1)
+			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
+
+		array = next->array;
+		dequeue_task(next, array);
+		recalc_task_prio(next, next->timestamp + delta);
+		enqueue_task(next, array);
+	}
+	next->activated = 0;
+switch_tasks:
+	prefetch(next);
+	clear_tsk_need_resched(prev);
+	RCU_qsctr(task_cpu(prev))++;
+
+	prev->sleep_avg -= run_time;
+	if ((long)prev->sleep_avg <= 0) {
+		prev->sleep_avg = 0;
+		if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev)))
+			prev->interactive_credit--;
+	}
+	prev->timestamp = now;
+
+	if (likely(prev != next)) {
+		next->timestamp = now;
+		rq->nr_switches++;
+		rq->curr = next;
+
+		prepare_arch_switch(rq, next);
+		prev = context_switch(rq, prev, next);
+		barrier();
+
+		finish_task_switch(prev);
+	} else
+		spin_unlock_irq(&rq->lock);
+
+	reacquire_kernel_lock(current);
+	preempt_enable_no_resched();
+	if (test_thread_flag(TIF_NEED_RESCHED))
+		goto need_resched;
+}
+
+EXPORT_SYMBOL(schedule);
+
+#ifdef CONFIG_PREEMPT
+/*
+ * This is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable. Kernel preemptions off of return-from-interrupt
+ * occur in the interrupt-return path, which calls schedule() directly.
+ */
+asmlinkage void preempt_schedule(void)
+{
+	struct thread_info *ti = current_thread_info();
+
+	/*
+	 * If there is a non-zero preempt_count or interrupts are disabled,
+	 * we do not want to preempt the current task. Just return..
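+	 *
+	 * Setting PREEMPT_ACTIVE around the schedule() call below is what
+	 * makes the PREEMPT_ACTIVE test in schedule() skip deactivate_task(),
+	 * so a preempted task that was just about to sleep is not removed
+	 * from the runqueue prematurely.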
+	 */
+	if (unlikely(ti->preempt_count || irqs_disabled()))
+		return;
+
+need_resched:
+	ti->preempt_count = PREEMPT_ACTIVE;
+	schedule();
+	ti->preempt_count = 0;
+
+	/* we could miss a preemption opportunity between schedule and now */
+	barrier();
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+		goto need_resched;
+}
+
+EXPORT_SYMBOL(preempt_schedule);
+#endif /* CONFIG_PREEMPT */
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync)
+{
+	task_t *p = curr->task;
+	return try_to_wake_up(p, mode, sync, 0);
+}
+
+EXPORT_SYMBOL(default_wake_function);
+
+/*
+ * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
+ * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
+ * number) then we wake all the non-exclusive tasks and one exclusive task.
+ *
+ * There are circumstances in which we can try to wake a task which has already
+ * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
+ * zero in this (rare) case, and we handle it by continuing to scan the queue.
+ */
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync)
+{
+	struct list_head *tmp, *next;
+
+	list_for_each_safe(tmp, next, &q->task_list) {
+		wait_queue_t *curr;
+		unsigned flags;
+		curr = list_entry(tmp, wait_queue_t, task_list);
+		flags = curr->flags;
+		if (curr->func(curr, mode, sync) &&
+		    (flags & WQ_FLAG_EXCLUSIVE) &&
+		    !--nr_exclusive)
+			break;
+	}
+}
+
+/**
+ * __wake_up - wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ */
+void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&q->lock, flags);
+	__wake_up_common(q, mode, nr_exclusive, 0);
+	spin_unlock_irqrestore(&q->lock, flags);
+}
+
+EXPORT_SYMBOL(__wake_up);
+
+/*
+ * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
+ */
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
+{
+	__wake_up_common(q, mode, 1, 0);
+}
+
+/**
+ * __wake_up_sync - sync-wake up threads blocked on a waitqueue.
+ * @q: the waitqueue
+ * @mode: which threads
+ * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ *
+ * The sync wakeup differs in that the waker knows that it will schedule
+ * away soon, so while the target thread will be woken up, it will not
+ * be migrated to another CPU - ie. the two threads are 'synchronized'
+ * with each other. This can prevent needless bouncing between CPUs.
+ *
+ * On UP it can prevent extra preemption.
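+ *
+ * An illustrative usage sketch (the wake_up_interruptible_sync()
+ * wrapper from linux/wait.h is assumed here):
+ *
+ *	wake_up_interruptible_sync(&q);	  woken task is not migrated
+ *	schedule();			  waker steps aside, wakee runs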
+ */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + unsigned long flags; + + if (unlikely(!q)) + return; + + spin_lock_irqsave(&q->lock, flags); + if (likely(nr_exclusive)) + __wake_up_common(q, mode, nr_exclusive, 1); + else + __wake_up_common(q, mode, nr_exclusive, 0); + spin_unlock_irqrestore(&q->lock, flags); +} + +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +void complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0); + spin_unlock_irqrestore(&x->wait.lock, flags); +} + +EXPORT_SYMBOL(complete); + +void complete_all(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += UINT_MAX/2; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0, 0); + spin_unlock_irqrestore(&x->wait.lock, flags); +} + +void wait_for_completion(struct completion *x) +{ + might_sleep(); + spin_lock_irq(&x->wait.lock); + if (!x->done) { + DECLARE_WAITQUEUE(wait, current); + + wait.flags |= WQ_FLAG_EXCLUSIVE; + __add_wait_queue_tail(&x->wait, &wait); + do { + __set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&x->wait.lock); + schedule(); + spin_lock_irq(&x->wait.lock); + } while (!x->done); + __remove_wait_queue(&x->wait, &wait); + } + x->done--; + spin_unlock_irq(&x->wait.lock); +} + +EXPORT_SYMBOL(wait_for_completion); + +#define SLEEP_ON_VAR \ + unsigned long flags; \ + wait_queue_t wait; \ + init_waitqueue_entry(&wait, current); + +#define SLEEP_ON_HEAD \ + spin_lock_irqsave(&q->lock,flags); \ + __add_wait_queue(q, &wait); \ + spin_unlock(&q->lock); + +#define SLEEP_ON_TAIL \ + spin_lock_irq(&q->lock); \ + __remove_wait_queue(q, &wait); \ + spin_unlock_irqrestore(&q->lock, flags); + +void interruptible_sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(interruptible_sleep_on); + +long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_INTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(interruptible_sleep_on_timeout); + +void sleep_on(wait_queue_head_t *q) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + schedule(); + SLEEP_ON_TAIL +} + +EXPORT_SYMBOL(sleep_on); + +long sleep_on_timeout(wait_queue_head_t *q, long timeout) +{ + SLEEP_ON_VAR + + current->state = TASK_UNINTERRUPTIBLE; + + SLEEP_ON_HEAD + timeout = schedule_timeout(timeout); + SLEEP_ON_TAIL + + return timeout; +} + +EXPORT_SYMBOL(sleep_on_timeout); + +void scheduling_functions_end_here(void) { } + +void set_user_nice(task_t *p, long nice) +{ + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + int old_prio, new_prio, delta; + + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; + /* + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. 
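+	 *
+	 * task_rq_lock() handles exactly that race: it re-checks task_rq(p)
+	 * after taking the runqueue lock, so once we hold the lock the task
+	 * can no longer migrate to another runqueue under us.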
+	 */
+	rq = task_rq_lock(p, &flags);
+	/*
+	 * The RT priorities are set via setscheduler(), but we still
+	 * allow the 'normal' nice value to be set - but as expected
+	 * it won't have any effect on scheduling until the task
+	 * becomes SCHED_NORMAL again:
+	 */
+	if (rt_task(p)) {
+		p->static_prio = NICE_TO_PRIO(nice);
+		goto out_unlock;
+	}
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+
+	old_prio = p->prio;
+	new_prio = NICE_TO_PRIO(nice);
+	delta = new_prio - old_prio;
+	p->static_prio = NICE_TO_PRIO(nice);
+	p->prio += delta;
+
+	if (array) {
+		enqueue_task(p, array);
+		/*
+		 * If the task increased its priority or is running and
+		 * lowered its priority, then reschedule its CPU:
+		 */
+		if (delta < 0 || (delta > 0 && task_running(rq, p)))
+			resched_task(rq->curr);
+	}
+out_unlock:
+	task_rq_unlock(rq, &flags);
+}
+
+EXPORT_SYMBOL(set_user_nice);
+
+#ifndef __alpha__
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+asmlinkage long sys_nice(int increment)
+{
+	int retval;
+	long nice;
+
+	/*
+	 * Setpriority might change our priority at the same moment.
+	 * We don't have to worry. Conceptually one call occurs first
+	 * and we have a single winner.
+	 */
+	if (increment < 0) {
+		if (!capable(CAP_SYS_NICE))
+			return -EPERM;
+		if (increment < -40)
+			increment = -40;
+	}
+	if (increment > 40)
+		increment = 40;
+
+	nice = PRIO_TO_NICE(current->static_prio) + increment;
+	if (nice < -20)
+		nice = -20;
+	if (nice > 19)
+		nice = 19;
+
+	retval = security_task_setnice(current, nice);
+	if (retval)
+		return retval;
+
+	set_user_nice(current, nice);
+	return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * This is the priority value as seen by users in /proc.
+ * RT tasks are offset by -200. Normal tasks are centered
+ * around 0, value goes from -16 to +15.
+ */
+int task_prio(task_t *p)
+{
+	return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ */
+int task_nice(task_t *p)
+{
+	return TASK_NICE(p);
+}
+
+EXPORT_SYMBOL(task_nice);
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ */
+int task_curr(task_t *p)
+{
+	return cpu_curr(task_cpu(p)) == p;
+}
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ */
+int idle_cpu(int cpu)
+{
+	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+EXPORT_SYMBOL_GPL(idle_cpu);
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ */
+static inline task_t *find_process_by_pid(pid_t pid)
+{
+	return pid ? find_task_by_pid(pid) : current;
+}
+
+/*
+ * setscheduler - change the scheduling policy and/or RT priority of a thread.
+ */
+static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	struct sched_param lp;
+	int retval = -EINVAL;
+	int oldprio;
+	prio_array_t *array;
+	unsigned long flags;
+	runqueue_t *rq;
+	task_t *p;
+
+	if (!param || pid < 0)
+		goto out_nounlock;
+
+	retval = -EFAULT;
+	if (copy_from_user(&lp, param, sizeof(struct sched_param)))
+		goto out_nounlock;
+
+	/*
+	 * We play safe to avoid deadlocks.
+	 */
+	read_lock_irq(&tasklist_lock);
+
+	p = find_process_by_pid(pid);
+
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock_tasklist;
+
+	/*
+	 * To be able to change p->policy safely, the appropriate
+	 * runqueue lock must be held.
+	 */
+	rq = task_rq_lock(p, &flags);
+
+	if (policy < 0)
+		policy = p->policy;
+	else {
+		retval = -EINVAL;
+		if (policy != SCHED_FIFO && policy != SCHED_RR &&
+				policy != SCHED_NORMAL)
+			goto out_unlock;
+	}
+
+	/*
+	 * Valid priorities for SCHED_FIFO and SCHED_RR are
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
+	 */
+	retval = -EINVAL;
+	if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
+		goto out_unlock;
+	if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
+		goto out_unlock;
+
+	retval = -EPERM;
+	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
+	    !capable(CAP_SYS_NICE))
+		goto out_unlock;
+	if ((current->euid != p->euid) && (current->euid != p->uid) &&
+	    !capable(CAP_SYS_NICE))
+		goto out_unlock;
+
+	retval = security_task_setscheduler(p, policy, &lp);
+	if (retval)
+		goto out_unlock;
+
+	array = p->array;
+	if (array)
+		deactivate_task(p, task_rq(p));
+	retval = 0;
+	p->policy = policy;
+	p->rt_priority = lp.sched_priority;
+	oldprio = p->prio;
+	if (policy != SCHED_NORMAL)
+		p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
+	else
+		p->prio = p->static_prio;
+	if (array) {
+		__activate_task(p, task_rq(p));
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running
+		 * on this runqueue and our priority is higher than the
+		 * current task's.
+		 */
+		if (rq->curr == p) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (p->prio < rq->curr->prio)
+			resched_task(rq->curr);
+	}
+
+out_unlock:
+	task_rq_unlock(rq, &flags);
+out_unlock_tasklist:
+	read_unlock_irq(&tasklist_lock);
+
+out_nounlock:
+	return retval;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+				       struct sched_param __user *param)
+{
+	return setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ */
+asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
+{
+	return setscheduler(pid, -1, param);
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ */
+asmlinkage long sys_sched_getscheduler(pid_t pid)
+{
+	int retval = -EINVAL;
+	task_t *p;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	read_lock(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	if (p) {
+		retval = security_task_getscheduler(p);
+		if (!retval)
+			retval = p->policy;
+	}
+	read_unlock(&tasklist_lock);
+
+out_nounlock:
+	return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ */ +asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) +{ + struct sched_param lp; + int retval = -EINVAL; + task_t *p; + + if (!param || pid < 0) + goto out_nounlock; + + read_lock(&tasklist_lock); + p = find_process_by_pid(pid); + retval = -ESRCH; + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + lp.sched_priority = p->rt_priority; + read_unlock(&tasklist_lock); + + /* + * This one might sleep, we cannot do it with a spinlock held ... + */ + retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; + +out_nounlock: + return retval; + +out_unlock: + read_unlock(&tasklist_lock); + return retval; +} + +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + cpumask_t new_mask; + int retval; + task_t *p; + + if (len < sizeof(new_mask)) + return -EINVAL; + + if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) + return -EFAULT; + + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + + /* + * It is not safe to call set_cpus_allowed with the + * tasklist_lock held. We will bump the task_struct's + * usage count and then drop tasklist_lock. + */ + get_task_struct(p); + read_unlock(&tasklist_lock); + + retval = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; + + retval = set_cpus_allowed(p, new_mask); + +out_unlock: + put_task_struct(p); + return retval; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long __user *user_mask_ptr) +{ + unsigned int real_len; + cpumask_t mask; + int retval; + task_t *p; + + real_len = sizeof(mask); + if (len < real_len) + return -EINVAL; + + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = 0; + cpus_and(mask, p->cpus_allowed, cpu_online_map); + +out_unlock: + read_unlock(&tasklist_lock); + if (retval) + return retval; + if (copy_to_user(user_mask_ptr, &mask, real_len)) + return -EFAULT; + return real_len; +} + +/** + * sys_sched_yield - yield the current processor to other threads. + * + * this function yields the current CPU by moving the calling thread + * to the expired array. If there are no other threads running on this + * CPU then this function will return. + */ +asmlinkage long sys_sched_yield(void) +{ + runqueue_t *rq = this_rq_lock(); + prio_array_t *array = current->array; + + /* + * We implement yielding by moving the task into the expired + * queue. + * + * (special rule: RT tasks will just roundrobin in the active + * array.) 
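+ *
+ * Moving to the expired array means a SCHED_NORMAL yielder typically
+ * does not run again until the remaining runnable tasks on this CPU
+ * have used up their timeslices and the arrays are switched.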
+ */
+	if (likely(!rt_task(current))) {
+		dequeue_task(current, array);
+		enqueue_task(current, rq->expired);
+	} else {
+		list_del(&current->run_list);
+		list_add_tail(&current->run_list, array->queue + current->prio);
+	}
+	/*
+	 * Since we are going to call schedule() anyway, there's
+	 * no need to preempt:
+	 */
+	_raw_spin_unlock(&rq->lock);
+	preempt_enable_no_resched();
+
+	schedule();
+
+	return 0;
+}
+
+void __cond_resched(void)
+{
+	set_current_state(TASK_RUNNING);
+	schedule();
+}
+
+EXPORT_SYMBOL(__cond_resched);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * this is a shortcut for kernel-space yielding - it marks the
+ * thread runnable and calls sys_sched_yield().
+ */
+void yield(void)
+{
+	set_current_state(TASK_RUNNING);
+	sys_sched_yield();
+}
+
+EXPORT_SYMBOL(yield);
+
+/*
+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+void io_schedule(void)
+{
+	struct runqueue *rq = this_rq();
+
+	atomic_inc(&rq->nr_iowait);
+	schedule();
+	atomic_dec(&rq->nr_iowait);
+}
+
+EXPORT_SYMBOL(io_schedule);
+
+long io_schedule_timeout(long timeout)
+{
+	struct runqueue *rq = this_rq();
+	long ret;
+
+	atomic_inc(&rq->nr_iowait);
+	ret = schedule_timeout(timeout);
+	atomic_dec(&rq->nr_iowait);
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the maximum rt_priority that can be used
+ * by a given scheduling class.
+ */
+asmlinkage long sys_sched_get_priority_max(int policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = MAX_USER_RT_PRIO-1;
+		break;
+	case SCHED_NORMAL:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * this syscall returns the minimum rt_priority that can be used
+ * by a given scheduling class.
+ */
+asmlinkage long sys_sched_get_priority_min(int policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = 1;
+		break;
+	case SCHED_NORMAL:
+		ret = 0;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * this syscall writes the default timeslice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ */
+asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
+{
+	int retval = -EINVAL;
+	struct timespec t;
+	task_t *p;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	read_lock(&tasklist_lock);
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	jiffies_to_timespec(p->policy & SCHED_FIFO ?
+				0 : task_timeslice(p), &t);
+	read_unlock(&tasklist_lock);
+	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+out_nounlock:
+	return retval;
+out_unlock:
+	read_unlock(&tasklist_lock);
+	return retval;
+}
+
+static inline struct task_struct *eldest_child(struct task_struct *p)
+{
+	if (list_empty(&p->children))
+		return NULL;
+	return list_entry(p->children.next, struct task_struct, sibling);
+}
+
+static inline struct task_struct *older_sibling(struct task_struct *p)
+{
+	if (p->sibling.prev == &p->parent->children)
+		return NULL;
+	return list_entry(p->sibling.prev, struct task_struct, sibling);
+}
+
+static inline struct task_struct *younger_sibling(struct task_struct *p)
+{
+	if (p->sibling.next == &p->parent->children)
+		return NULL;
+	return list_entry(p->sibling.next, struct task_struct, sibling);
+}
+
+static void show_task(task_t * p)
+{
+	unsigned long free = 0;
+	task_t *relative;
+	int state;
+	static const char * stat_nam[] = { "R", "S", "D", "T", "Z", "W" };
+
+	printk("%-13.13s ", p->comm);
+	state = p->state ? __ffs(p->state) + 1 : 0;
+	if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *))
+		printk(stat_nam[state]);
+	else
+		printk(" ");
+#if (BITS_PER_LONG == 32)
+	if (p == current)
+		printk(" current  ");
+	else
+		printk(" %08lX ", thread_saved_pc(p));
+#else
+	if (p == current)
+		printk("   current task   ");
+	else
+		printk(" %016lx ", thread_saved_pc(p));
+#endif
+	{
+		unsigned long * n = (unsigned long *) (p->thread_info+1);
+		while (!*n)
+			n++;
+		free = (unsigned long) n - (unsigned long)(p->thread_info+1);
+	}
+	printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
+	if ((relative = eldest_child(p)))
+		printk("%5d ", relative->pid);
+	else
+		printk("      ");
+	if ((relative = younger_sibling(p)))
+		printk("%7d", relative->pid);
+	else
+		printk("       ");
+	if ((relative = older_sibling(p)))
+		printk(" %5d", relative->pid);
+	else
+		printk("      ");
+	if (!p->mm)
+		printk(" (L-TLB)\n");
+	else
+		printk(" (NOTLB)\n");
+
+	show_stack(p, NULL);
+}
+
+void show_state(void)
+{
+	task_t *g, *p;
+
+#if (BITS_PER_LONG == 32)
+	printk("\n"
+	       "                         free                        sibling\n");
+	printk("  task             PC    stack   pid father child younger older\n");
+#else
+	printk("\n"
+	       "                                 free                        sibling\n");
+	printk("  task                 PC        stack   pid father child younger older\n");
+#endif
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		/*
+		 * reset the NMI-timeout, listing all files on a slow
+		 * console might take a lot of time:
+		 */
+		touch_nmi_watchdog();
+		show_task(p);
+	} while_each_thread(g, p);
+
+	read_unlock(&tasklist_lock);
+}
+
+void __init init_idle(task_t *idle, int cpu)
+{
+	runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
+	unsigned long flags;
+
+	local_irq_save(flags);
+	double_rq_lock(idle_rq, rq);
+
+	idle_rq->curr = idle_rq->idle = idle;
+	deactivate_task(idle, rq);
+	idle->array = NULL;
+	idle->prio = MAX_PRIO;
+	idle->state = TASK_RUNNING;
+	set_task_cpu(idle, cpu);
+	double_rq_unlock(idle_rq, rq);
+	set_tsk_need_resched(idle);
+	local_irq_restore(flags);
+
+	/* Set the preempt count _outside_ the spinlocks! */
+#ifdef CONFIG_PREEMPT
+	idle->thread_info->preempt_count = (idle->lock_depth >= 0);
+#else
+	idle->thread_info->preempt_count = 0;
+#endif
+}
+
+#ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we queue a migration_req_t structure in the source CPU's
+ *    runqueue and wake up that CPU's migration thread.
+ * 2) we down() the locked semaphore => thread blocks.
+ * 3) migration thread wakes up (implicitly it forces the migrated
+ *    thread off the CPU)
+ * 4) it gets the migration request and checks whether the migrated
+ *    task is still in the wrong runqueue.
+ * 5) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 6) migration thread up()s the semaphore. + * 7) we wake up and the migration is done. + */ + +typedef struct { + struct list_head list; + task_t *task; + struct completion done; +} migration_req_t; + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. + */ +int set_cpus_allowed(task_t *p, cpumask_t new_mask) +{ + unsigned long flags; + migration_req_t req; + runqueue_t *rq; + + if (any_online_cpu(new_mask) == NR_CPUS) + return -EINVAL; + + rq = task_rq_lock(p, &flags); + p->cpus_allowed = new_mask; + /* + * Can the task run on the task's current CPU? If not then + * migrate the thread off to a proper CPU. + */ + if (cpu_isset(task_cpu(p), new_mask)) { + task_rq_unlock(rq, &flags); + return 0; + } + /* + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ + if (!p->array && !task_running(rq, p)) { + set_task_cpu(p, any_online_cpu(p->cpus_allowed)); + task_rq_unlock(rq, &flags); + return 0; + } + init_completion(&req.done); + req.task = p; + list_add(&req.list, &rq->migration_queue); + task_rq_unlock(rq, &flags); + + wake_up_process(rq->migration_thread); + + wait_for_completion(&req.done); + return 0; +} + +EXPORT_SYMBOL_GPL(set_cpus_allowed); + +/* Move (not current) task off this cpu, onto dest cpu. */ +static void move_task_away(struct task_struct *p, int dest_cpu) +{ + runqueue_t *rq_dest; + unsigned long flags; + + rq_dest = cpu_rq(dest_cpu); + + local_irq_save(flags); + double_rq_lock(this_rq(), rq_dest); + if (task_cpu(p) != smp_processor_id()) + goto out; /* Already moved */ + + set_task_cpu(p, dest_cpu); + if (p->array) { + deactivate_task(p, this_rq()); + activate_task(p, rq_dest); + if (p->prio < rq_dest->curr->prio) + resched_task(rq_dest->curr); + } + out: + double_rq_unlock(this_rq(), rq_dest); + local_irq_restore(flags); +} + +typedef struct { + int cpu; + struct completion startup_done; + task_t *task; +} migration_startup_t; + +/* + * migration_thread - this is a highprio system thread that performs + * thread migration by bumping thread off CPU then 'pushing' onto + * another runqueue. 
+ */
+static int migration_thread(void * data)
+{
+	/* Marking "param" __user is ok, since we do a set_fs(KERNEL_DS); */
+	struct sched_param __user param = { .sched_priority = MAX_RT_PRIO-1 };
+	migration_startup_t *startup = data;
+	int cpu = startup->cpu;
+	runqueue_t *rq;
+	int ret;
+
+	startup->task = current;
+	complete(&startup->startup_done);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule();
+
+	BUG_ON(smp_processor_id() != cpu);
+
+	daemonize("migration/%d", cpu);
+	set_fs(KERNEL_DS);
+
+	ret = setscheduler(0, SCHED_FIFO, &param);
+
+	rq = this_rq();
+	rq->migration_thread = current;
+
+	for (;;) {
+		struct list_head *head;
+		migration_req_t *req;
+
+		if (current->flags & PF_FREEZE)
+			refrigerator(PF_IOTHREAD);
+
+		spin_lock_irq(&rq->lock);
+		head = &rq->migration_queue;
+		current->state = TASK_INTERRUPTIBLE;
+		if (list_empty(head)) {
+			spin_unlock_irq(&rq->lock);
+			schedule();
+			continue;
+		}
+		req = list_entry(head->next, migration_req_t, list);
+		list_del_init(head->next);
+		spin_unlock_irq(&rq->lock);
+
+		move_task_away(req->task,
+			       any_online_cpu(req->task->cpus_allowed));
+		complete(&req->done);
+	}
+}
+
+/*
+ * migration_call - callback that gets triggered when a CPU is added.
+ * Here we can start up the necessary migration thread for the new CPU.
+ */
+static int migration_call(struct notifier_block *nfb,
+			  unsigned long action,
+			  void *hcpu)
+{
+	long cpu = (long) hcpu;
+	migration_startup_t startup;
+
+	switch (action) {
+	case CPU_ONLINE:
+
+		printk("Starting migration thread for cpu %li\n", cpu);
+
+		startup.cpu = cpu;
+		startup.task = NULL;
+		init_completion(&startup.startup_done);
+
+		kernel_thread(migration_thread, &startup, CLONE_KERNEL);
+		wait_for_completion(&startup.startup_done);
+		wait_task_inactive(startup.task);
+
+		startup.task->thread_info->cpu = cpu;
+		startup.task->cpus_allowed = cpumask_of_cpu(cpu);
+
+		wake_up_process(startup.task);
+
+		while (!cpu_rq(cpu)->migration_thread)
+			yield();
+
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block migration_notifier = { &migration_call, NULL, 0 };
+
+__init int migration_init(void)
+{
+	/* Start one for boot CPU. */
+	migration_call(&migration_notifier, CPU_ONLINE,
+		       (void *)(long)smp_processor_id());
+	register_cpu_notifier(&migration_notifier);
+	return 0;
+}
+
+#endif
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
+/*
+ * The 'big kernel lock'
+ *
+ * This spinlock is taken and released recursively by lock_kernel()
+ * and unlock_kernel(). It is transparently dropped and reacquired
+ * over schedule(). It is used to protect legacy code that hasn't
+ * been migrated to a proper locking design yet.
+ *
+ * Don't use in new code.
+ */
+spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
+
+EXPORT_SYMBOL(kernel_flag);
+#endif
+
+static void kstat_init_cpu(int cpu)
+{
+	/* Add any initialisation to kstat here */
+	/* Useful when cpu offlining logic is added..
*/ +} + +static int __devinit kstat_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + switch(action) { + case CPU_UP_PREPARE: + kstat_init_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata kstat_nb = { + .notifier_call = kstat_cpu_notify, + .next = NULL, +}; + +__init static void init_kstat(void) { + kstat_cpu_notify(&kstat_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&kstat_nb); +} + +void __init sched_init(void) +{ + runqueue_t *rq; + int i, j, k; + + /* Init the kstat counters */ + init_kstat(); + for (i = 0; i < NR_CPUS; i++) { + prio_array_t *array; + + rq = cpu_rq(i); + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + spin_lock_init(&rq->lock); + INIT_LIST_HEAD(&rq->migration_queue); + atomic_set(&rq->nr_iowait, 0); + nr_running_init(rq); + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + } + /* + * We have to do a little magic to get the first + * thread right in SMP mode. + */ + rq = this_rq(); + rq->curr = current; + rq->idle = current; + set_task_cpu(current, smp_processor_id()); + wake_up_forked_process(current); + + init_timers(); + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); +} + +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +void __might_sleep(char *file, int line) +{ +#if defined(in_atomic) + static unsigned long prev_jiffy; /* ratelimiting */ + + if (in_atomic() || irqs_disabled()) { + if (time_before(jiffies, prev_jiffy + HZ)) + return; + prev_jiffy = jiffies; + printk(KERN_ERR "Debug: sleeping function called from invalid" + " context at %s:%d\n", file, line); + printk("in_atomic():%d, irqs_disabled():%d\n", + in_atomic(), irqs_disabled()); + dump_stack(); + } +#endif +} +EXPORT_SYMBOL(__might_sleep); +#endif + + +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) +/* + * This could be a long-held lock. If another CPU holds it for a long time, + * and that CPU is not asked to reschedule then *this* CPU will spin on the + * lock for a long time, even if *this* CPU is asked to reschedule. + * + * So what we do here, in the slow (contended) path is to spin on the lock by + * hand while permitting preemption. + * + * Called inside preempt_disable(). 
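+ *
+ * (preempt_count() > 1 in the test below means someone other than this
+ * function's caller has also disabled preemption, so it is not safe to
+ * re-enable it here; in that case we fall back to a plain
+ * _raw_spin_lock().)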
+ */ +void __preempt_spin_lock(spinlock_t *lock) +{ + if (preempt_count() > 1) { + _raw_spin_lock(lock); + return; + } + do { + preempt_enable(); + while (spin_is_locked(lock)) + cpu_relax(); + preempt_disable(); + } while (!_raw_spin_trylock(lock)); +} + +EXPORT_SYMBOL(__preempt_spin_lock); + +void __preempt_write_lock(rwlock_t *lock) +{ + if (preempt_count() > 1) { + _raw_write_lock(lock); + return; + } + + do { + preempt_enable(); + while (rwlock_is_locked(lock)) + cpu_relax(); + preempt_disable(); + } while (!_raw_write_trylock(lock)); +} + +EXPORT_SYMBOL(__preempt_write_lock); +#endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */ diff -Nru a/lib/div64.c b/lib/div64.c --- a/lib/div64.c Fri Oct 31 14:10:53 2003 +++ b/lib/div64.c Fri Oct 31 14:10:53 2003 @@ -25,25 +25,34 @@ uint32_t __div64_32(uint64_t *n, uint32_t base) { - uint32_t low, low2, high, rem; + uint64_t rem = *n; + uint64_t b = base; + uint64_t res, d = 1; + uint32_t high = rem >> 32; - low = *n & 0xffffffff; - high = *n >> 32; - rem = high % (uint32_t)base; - high = high / (uint32_t)base; - low2 = low >> 16; - low2 += rem << 16; - rem = low2 % (uint32_t)base; - low2 = low2 / (uint32_t)base; - low = low & 0xffff; - low += rem << 16; - rem = low % (uint32_t)base; - low = low / (uint32_t)base; + /* Reduce the thing a bit first */ + res = 0; + if (high >= base) { + high /= base; + res = (uint64_t) high << 32; + rem -= (uint64_t) (high*base) << 32; + } - *n = low + - ((uint64_t)low2 << 16) + - ((uint64_t)high << 32); + while ((int64_t)b > 0 && b < rem) { + b = b+b; + d = d+d; + } + do { + if (rem >= b) { + rem -= b; + res += d; + } + b >>= 1; + d >>= 1; + } while (d); + + *n = res; return rem; } diff -Nru a/mm/Makefile b/mm/Makefile --- a/mm/Makefile Fri Oct 31 14:10:54 2003 +++ b/mm/Makefile Fri Oct 31 14:10:54 2003 @@ -12,3 +12,4 @@ slab.o swap.o truncate.o vmscan.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o +obj-$(CONFIG_PROC_MM) += proc_mm.o diff -Nru a/mm/Makefile~uml-summa.diff b/mm/Makefile~uml-summa.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/Makefile~uml-summa.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,14 @@ +# +# Makefile for the linux memory manager. 
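Stepping back to the __div64_32() rewrite above: it replaces the old three-step 16-bit chunking with classic shift-and-subtract long division. A standalone copy that can be checked against native 64-bit division; the cast on high * base is widened here for clarity (the product fits in 32 bits anyway, since high was just divided by base):

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

static uint32_t div64_32(uint64_t *n, uint32_t base)
{
    uint64_t rem = *n, b = base, res = 0, d = 1;
    uint32_t high = rem >> 32;

    if (high >= base) {                 /* reduce the top half first */
        high /= base;
        res = (uint64_t)high << 32;
        rem -= ((uint64_t)high * base) << 32;
    }
    while ((int64_t)b > 0 && b < rem) { /* align divisor under remainder */
        b += b;
        d += d;
    }
    do {                                /* restoring division, bit by bit */
        if (rem >= b) {
            rem -= b;
            res += d;
        }
        b >>= 1;
        d >>= 1;
    } while (d);

    *n = res;
    return rem;
}

int main(void)
{
    uint64_t n = 0x123456789abcdefULL, q = n;
    uint32_t r = div64_32(&q, 1000000007u);

    assert(q == n / 1000000007u);
    assert(r == n % 1000000007u);
    printf("%llu / 1000000007 = %llu rem %u\n",
           (unsigned long long)n, (unsigned long long)q, r);
    return 0;
}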
+# + +mmu-y := nommu.o +mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ + mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ + shmem.o vmalloc.o + +obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ + page_alloc.o page-writeback.o pdflush.o readahead.o \ + slab.o swap.o truncate.o vmscan.o $(mmu-y) + +obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o diff -Nru a/mm/filemap.c b/mm/filemap.c --- a/mm/filemap.c Fri Oct 31 14:10:53 2003 +++ b/mm/filemap.c Fri Oct 31 14:10:53 2003 @@ -82,6 +82,9 @@ * ->private_lock (try_to_unmap_one) * ->page_lock (try_to_unmap_one) * ->zone.lru_lock (follow_page->mark_page_accessed) + * + * ->task->proc_lock + * ->dcache_lock (proc_pid_lookup) */ /* @@ -112,6 +115,7 @@ __remove_from_page_cache(page); spin_unlock(&mapping->page_lock); } +EXPORT_SYMBOL(remove_from_page_cache); static inline int sync_page(struct page *page) { diff -Nru a/mm/filemap.c~export-remove_from_page_cache.diff b/mm/filemap.c~export-remove_from_page_cache.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/filemap.c~export-remove_from_page_cache.diff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,1980 @@ +/* + * linux/mm/filemap.c + * + * Copyright (C) 1994-1999 Linus Torvalds + */ + +/* + * This file handles the generic file mmap semantics used by + * most "normal" filesystems (but you don't /have/ to use this: + * the NFS filesystem used to do this differently, for example) + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/aio.h> +#include <linux/kernel_stat.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/mman.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/hash.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include <linux/blkdev.h> +#include <linux/security.h> +/* + * This is needed for the following functions: + * - try_to_release_page + * - block_invalidatepage + * - generic_osync_inode + * + * FIXME: remove all knowledge of the buffer layer from the core VM + */ +#include <linux/buffer_head.h> /* for generic_osync_inode */ + +#include <asm/uaccess.h> +#include <asm/mman.h> + +/* + * Shared mappings implemented 30.11.1994. It's not fully working yet, + * though. + * + * Shared mappings now work. 15.8.1995 Bruno. + * + * finished 'unifying' the page and buffer cache and SMP-threaded the + * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com> + * + * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de> + */ + +/* + * Lock ordering: + * + * ->i_shared_sem (vmtruncate) + * ->private_lock (__free_pte->__set_page_dirty_buffers) + * ->swap_list_lock + * ->swap_device_lock (exclusive_swap_page, others) + * ->mapping->page_lock + * + * ->mmap_sem + * ->i_shared_sem (various places) + * + * ->mmap_sem + * ->lock_page (access_process_vm) + * + * ->mmap_sem + * ->i_sem (msync) + * + * ->inode_lock + * ->sb_lock (fs/fs-writeback.c) + * ->mapping->page_lock (__sync_single_inode) + * + * ->page_table_lock + * ->swap_device_lock (try_to_unmap_one) + * ->private_lock (try_to_unmap_one) + * ->page_lock (try_to_unmap_one) + * ->zone.lru_lock (follow_page->mark_page_accessed) + */ + +/* + * Remove a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. The caller must hold a write_lock on the mapping's page_lock. 
+ */ +void __remove_from_page_cache(struct page *page) +{ + struct address_space *mapping = page->mapping; + + radix_tree_delete(&mapping->page_tree, page->index); + list_del(&page->list); + page->mapping = NULL; + + mapping->nrpages--; + pagecache_acct(-1); +} + +void remove_from_page_cache(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (unlikely(!PageLocked(page))) + PAGE_BUG(page); + + spin_lock(&mapping->page_lock); + __remove_from_page_cache(page); + spin_unlock(&mapping->page_lock); +} + +static inline int sync_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + + if (mapping && mapping->a_ops && mapping->a_ops->sync_page) + return mapping->a_ops->sync_page(page); + return 0; +} + +/** + * filemap_fdatawrite - start writeback against all of a mapping's dirty pages + * @mapping: address space structure to write + * + * This is a "data integrity" operation, as opposed to a regular memory + * cleansing writeback. The difference between these two operations is that + * if a dirty page/buffer is encountered, it must be waited upon, and not just + * skipped over. + */ +static int __filemap_fdatawrite(struct address_space *mapping, int sync_mode) +{ + int ret; + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = mapping->nrpages * 2, + }; + + if (mapping->backing_dev_info->memory_backed) + return 0; + + spin_lock(&mapping->page_lock); + list_splice_init(&mapping->dirty_pages, &mapping->io_pages); + spin_unlock(&mapping->page_lock); + ret = do_writepages(mapping, &wbc); + return ret; +} + +int filemap_fdatawrite(struct address_space *mapping) +{ + return __filemap_fdatawrite(mapping, WB_SYNC_ALL); +} + +EXPORT_SYMBOL(filemap_fdatawrite); + +/* + * This is a mostly non-blocking flush. Not suitable for data-integrity + * purposes. + */ +int filemap_flush(struct address_space *mapping) +{ + return __filemap_fdatawrite(mapping, WB_SYNC_NONE); +} + +/** + * filemap_fdatawait - walk the list of locked pages of the given address + * space and wait for all of them. + * @mapping: address space structure to wait for + */ +int filemap_fdatawait(struct address_space * mapping) +{ + int ret = 0; + int progress; + +restart: + progress = 0; + spin_lock(&mapping->page_lock); + while (!list_empty(&mapping->locked_pages)) { + struct page *page; + + page = list_entry(mapping->locked_pages.next,struct page,list); + list_del(&page->list); + if (PageDirty(page)) + list_add(&page->list, &mapping->dirty_pages); + else + list_add(&page->list, &mapping->clean_pages); + + if (!PageWriteback(page)) { + if (++progress > 32) { + if (need_resched()) { + spin_unlock(&mapping->page_lock); + __cond_resched(); + goto restart; + } + } + continue; + } + + progress = 0; + page_cache_get(page); + spin_unlock(&mapping->page_lock); + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + + page_cache_release(page); + spin_lock(&mapping->page_lock); + } + spin_unlock(&mapping->page_lock); + + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + + return ret; +} + +EXPORT_SYMBOL(filemap_fdatawait); + +/* + * This adds a page to the page cache, starting out as locked, unreferenced, + * not uptodate and with no errors. + * + * This function is used for two things: adding newly allocated pagecache + * pages and for moving existing anon pages into swapcache. 
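The mechanically interesting part of add_to_page_cache() below is radix_tree_preload(): node memory is reserved before the spinlock is taken, so the insertion under the lock cannot fail on allocation. A tiny userspace illustration of the same reserve-then-insert pattern (hypothetical code, not the radix tree API):

#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>

struct node { int key; struct node *next; };
static struct node *head;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static int insert(int key)
{
    struct node *n = malloc(sizeof(*n));  /* "preload" outside the lock */

    if (!n)
        return -1;                 /* fail before the lock is ever held */
    pthread_mutex_lock(&lock);     /* the insertion itself cannot fail */
    n->key = key;
    n->next = head;
    head = n;
    pthread_mutex_unlock(&lock);
    return 0;
}

int main(void)
{
    printf("insert: %d\n", insert(42));
    return 0;
}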
+ * + * In the case of pagecache pages, the page is new, so we can just run + * SetPageLocked() against it. The other page state flags were set by + * rmqueue() + * + * In the case of swapcache, try_to_swap_out() has already locked the page, so + * SetPageLocked() is ugly-but-OK there too. The required page state has been + * set up by swap_out_add_to_swap_cache(). + * + * This function does not add the page to the LRU. The caller must do that. + */ +int add_to_page_cache(struct page *page, struct address_space *mapping, + pgoff_t offset, int gfp_mask) +{ + int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + + if (error == 0) { + page_cache_get(page); + spin_lock(&mapping->page_lock); + error = radix_tree_insert(&mapping->page_tree, offset, page); + if (!error) { + SetPageLocked(page); + ___add_to_page_cache(page, mapping, offset); + } else { + page_cache_release(page); + } + spin_unlock(&mapping->page_lock); + radix_tree_preload_end(); + } + return error; +} + +EXPORT_SYMBOL(add_to_page_cache); + +int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t offset, int gfp_mask) +{ + int ret = add_to_page_cache(page, mapping, offset, gfp_mask); + if (ret == 0) + lru_cache_add(page); + return ret; +} + +/* + * In order to wait for pages to become available there must be + * waitqueues associated with pages. By using a hash table of + * waitqueues where the bucket discipline is to maintain all + * waiters on the same queue and wake all when any of the pages + * become available, and for the woken contexts to check to be + * sure the appropriate page became available, this saves space + * at a cost of "thundering herd" phenomena during rare hash + * collisions. + */ +static wait_queue_head_t *page_waitqueue(struct page *page) +{ + const struct zone *zone = page_zone(page); + + return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; +} + +void wait_on_page_bit(struct page *page, int bit_nr) +{ + wait_queue_head_t *waitqueue = page_waitqueue(page); + DEFINE_WAIT(wait); + + do { + prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE); + if (test_bit(bit_nr, &page->flags)) { + sync_page(page); + io_schedule(); + } + } while (test_bit(bit_nr, &page->flags)); + finish_wait(waitqueue, &wait); +} + +EXPORT_SYMBOL(wait_on_page_bit); + +/** + * unlock_page() - unlock a locked page + * + * @page: the page + * + * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). + * Also wakes sleepers in wait_on_page_writeback() because the wakeup + * mechananism between PageLocked pages and PageWriteback pages is shared. + * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. + * + * The first mb is necessary to safely close the critical section opened by the + * TestSetPageLocked(), the second mb is necessary to enforce ordering between + * the clear_bit and the read of the waitqueue (to avoid SMP races with a + * parallel wait_on_page_locked()). + */ +void unlock_page(struct page *page) +{ + wait_queue_head_t *waitqueue = page_waitqueue(page); + smp_mb__before_clear_bit(); + if (!TestClearPageLocked(page)) + BUG(); + smp_mb__after_clear_bit(); + if (waitqueue_active(waitqueue)) + wake_up_all(waitqueue); +} + +EXPORT_SYMBOL(unlock_page); +EXPORT_SYMBOL(lock_page); + +/* + * End writeback against a page. 
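The page_waitqueue() scheme above hashes the page pointer into a fixed per-zone table of shared waitqueues, trading occasional spurious wakeups for not embedding a waitqueue in every struct page. A toy version of hashing pointers into buckets; the constant is the era's 32-bit golden-ratio prime, and this is a simplified stand-in, not the kernel's hash_ptr():

#include <stdint.h>
#include <stdio.h>

#define WAIT_TABLE_BITS 8

static unsigned int hash_ptr(const void *p, unsigned int bits)
{
    /* multiplicative hash: keep the well-mixed top bits */
    return ((uint32_t)(uintptr_t)p * 0x9e370001u) >> (32 - bits);
}

int main(void)
{
    static int pages[4];
    int i;

    for (i = 0; i < 4; i++)
        printf("page %d -> wait bucket %u\n", i,
               hash_ptr(&pages[i], WAIT_TABLE_BITS));
    return 0;
}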
+ */ +void end_page_writeback(struct page *page) +{ + wait_queue_head_t *waitqueue = page_waitqueue(page); + + if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { + smp_mb__before_clear_bit(); + if (!TestClearPageWriteback(page)) + BUG(); + smp_mb__after_clear_bit(); + } + if (waitqueue_active(waitqueue)) + wake_up_all(waitqueue); +} + +EXPORT_SYMBOL(end_page_writeback); + +/* + * Get a lock on the page, assuming we need to sleep to get it. + * + * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some + * random driver's requestfn sets TASK_RUNNING, we could busywait. However + * chances are that on the second loop, the block layer's plug list is empty, + * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. + */ +void __lock_page(struct page *page) +{ + wait_queue_head_t *wqh = page_waitqueue(page); + DEFINE_WAIT(wait); + + while (TestSetPageLocked(page)) { + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + if (PageLocked(page)) { + sync_page(page); + io_schedule(); + } + } + finish_wait(wqh, &wait); +} + +EXPORT_SYMBOL(__lock_page); + +/* + * a rather lightweight function, finding and getting a reference to a + * hashed page atomically. + */ +struct page * find_get_page(struct address_space *mapping, unsigned long offset) +{ + struct page *page; + + /* + * We scan the hash list read-only. Addition to and removal from + * the hash-list needs a held write-lock. + */ + spin_lock(&mapping->page_lock); + page = radix_tree_lookup(&mapping->page_tree, offset); + if (page) + page_cache_get(page); + spin_unlock(&mapping->page_lock); + return page; +} + +EXPORT_SYMBOL(find_get_page); + +/* + * Same as above, but trylock it instead of incrementing the count. + */ +struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) +{ + struct page *page; + + spin_lock(&mapping->page_lock); + page = radix_tree_lookup(&mapping->page_tree, offset); + if (page && TestSetPageLocked(page)) + page = NULL; + spin_unlock(&mapping->page_lock); + return page; +} + +EXPORT_SYMBOL(find_trylock_page); + +/** + * find_lock_page - locate, pin and lock a pagecache page + * + * @mapping - the address_space to search + * @offset - the page index + * + * Locates the desired pagecache page, locks it, increments its reference + * count and returns its address. + * + * Returns zero if the page was not present. find_lock_page() may sleep. + */ +struct page *find_lock_page(struct address_space *mapping, + unsigned long offset) +{ + struct page *page; + + spin_lock(&mapping->page_lock); +repeat: + page = radix_tree_lookup(&mapping->page_tree, offset); + if (page) { + page_cache_get(page); + if (TestSetPageLocked(page)) { + spin_unlock(&mapping->page_lock); + lock_page(page); + spin_lock(&mapping->page_lock); + + /* Has the page been truncated while we slept? */ + if (page->mapping != mapping || page->index != offset) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + } + } + spin_unlock(&mapping->page_lock); + return page; +} + +EXPORT_SYMBOL(find_lock_page); + +/** + * find_or_create_page - locate or add a pagecache page + * + * @mapping - the page's address_space + * @index - the page's index into the mapping + * @gfp_mask - page allocation mode + * + * Locates a page in the pagecache. If the page is not present, a new page + * is allocated using @gfp_mask and is added to the pagecache and to the VM's + * LRU list. The returned page is locked and has its reference count + * incremented. 
+ * + * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic + * allocation! + * + * find_or_create_page() returns the desired page's address, or zero on + * memory exhaustion. + */ +struct page *find_or_create_page(struct address_space *mapping, + unsigned long index, unsigned int gfp_mask) +{ + struct page *page, *cached_page = NULL; + int err; +repeat: + page = find_lock_page(mapping, index); + if (!page) { + if (!cached_page) { + cached_page = alloc_page(gfp_mask); + if (!cached_page) + return NULL; + } + err = add_to_page_cache_lru(cached_page, mapping, + index, gfp_mask); + if (!err) { + page = cached_page; + cached_page = NULL; + } else if (err == -EEXIST) + goto repeat; + } + if (cached_page) + page_cache_release(cached_page); + return page; +} + +EXPORT_SYMBOL(find_or_create_page); + +/** + * find_get_pages - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page index + * @nr_pages: The maximum number of pages + * @pages: Where the resulting pages are placed + * + * find_get_pages() will search for and return a group of up to + * @nr_pages pages in the mapping. The pages are placed at @pages. + * find_get_pages() takes a reference against the returned pages. + * + * The search returns a group of mapping-contiguous pages with ascending + * indexes. There may be holes in the indices due to not-present pages. + * + * find_get_pages() returns the number of pages which were found. + */ +unsigned int find_get_pages(struct address_space *mapping, pgoff_t start, + unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + + spin_lock(&mapping->page_lock); + ret = radix_tree_gang_lookup(&mapping->page_tree, + (void **)pages, start, nr_pages); + for (i = 0; i < ret; i++) + page_cache_get(pages[i]); + spin_unlock(&mapping->page_lock); + return ret; +} + +/* + * Same as grab_cache_page, but do not wait if the page is unavailable. + * This is intended for speculative data generators, where the data can + * be regenerated if the page couldn't be grabbed. This routine should + * be safe to call while holding the lock for another page. + * + * Clear __GFP_FS when allocating the page to avoid recursion into the fs + * and deadlock against the caller's locked page. + */ +struct page * +grab_cache_page_nowait(struct address_space *mapping, unsigned long index) +{ + struct page *page = find_get_page(mapping, index); + int gfp_mask; + + if (page) { + if (!TestSetPageLocked(page)) + return page; + page_cache_release(page); + return NULL; + } + gfp_mask = mapping_gfp_mask(mapping) & ~__GFP_FS; + page = alloc_pages(gfp_mask, 0); + if (page && add_to_page_cache_lru(page, mapping, index, gfp_mask)) { + page_cache_release(page); + page = NULL; + } + return page; +} + +EXPORT_SYMBOL(grab_cache_page_nowait); + +/* + * This is a generic file read routine, and uses the + * inode->i_op->readpage() function for the actual low-level + * stuff. + * + * This is really ugly. But the goto's actually try to clarify some + * of the logic when it comes to error handling etc. 
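do_generic_mapping_read() below drives a read_descriptor_t/actor pair: the loop locates cached pages and the actor reports how many bytes it actually consumed, so the driver can advance position independently of the consumer. A compact userspace analogue of that contract (hypothetical names):

#include <stdio.h>
#include <string.h>

typedef struct {
    char  *buf;
    size_t count;       /* bytes still wanted */
    size_t written;     /* bytes delivered so far */
} read_desc;

/* the "actor": consume up to size bytes from one cached chunk */
static size_t copy_actor(read_desc *desc, const char *chunk, size_t size)
{
    size_t n = size < desc->count ? size : desc->count;

    memcpy(desc->buf + desc->written, chunk, n);
    desc->written += n;
    desc->count   -= n;
    return n;
}

int main(void)
{
    const char *chunks[] = { "page0:", "page1:", "page2" };
    char out[64] = "";
    read_desc desc = { out, sizeof(out) - 1, 0 };
    int i;

    /* the driver walks "pages"; the actor decides how much it used */
    for (i = 0; i < 3 && desc.count; i++)
        copy_actor(&desc, chunks[i], strlen(chunks[i]));
    printf("%s (%zu bytes)\n", out, desc.written);
    return 0;
}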
+ * - note the struct file * is only passed for the use of readpage + */ +void do_generic_mapping_read(struct address_space *mapping, + struct file_ra_state *ra, + struct file * filp, + loff_t *ppos, + read_descriptor_t * desc, + read_actor_t actor) +{ + struct inode *inode = mapping->host; + unsigned long index, offset; + struct page *cached_page; + int error; + + cached_page = NULL; + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + + for (;;) { + struct page *page; + unsigned long end_index, nr, ret; + loff_t isize = i_size_read(inode); + + end_index = isize >> PAGE_CACHE_SHIFT; + + if (index > end_index) + break; + nr = PAGE_CACHE_SIZE; + if (index == end_index) { + nr = isize & ~PAGE_CACHE_MASK; + if (nr <= offset) + break; + } + + cond_resched(); + page_cache_readahead(mapping, ra, filp, index); + + nr = nr - offset; +find_page: + page = find_get_page(mapping, index); + if (unlikely(page == NULL)) { + handle_ra_miss(mapping, ra, index); + goto no_cached_page; + } + if (!PageUptodate(page)) + goto page_not_up_to_date; +page_ok: + /* If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (!list_empty(&mapping->i_mmap_shared)) + flush_dcache_page(page); + + /* + * Mark the page accessed if we read the beginning. + */ + if (!offset) + mark_page_accessed(page); + + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + * + * The actor routine returns how many bytes were actually used.. + * NOTE! This may not be the same as how much of a user buffer + * we filled up (we may be padding etc), so we can only update + * "pos" here (the actor routine has to update the user buffer + * pointers and the remaining count). + */ + ret = actor(desc, page, offset, nr); + offset += ret; + index += offset >> PAGE_CACHE_SHIFT; + offset &= ~PAGE_CACHE_MASK; + + page_cache_release(page); + if (ret == nr && desc->count) + continue; + break; + +page_not_up_to_date: + if (PageUptodate(page)) + goto page_ok; + + /* Get exclusive access to the page ... */ + lock_page(page); + + /* Did it get unhashed before we got the lock? */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + /* Did somebody else fill it already? */ + if (PageUptodate(page)) { + unlock_page(page); + goto page_ok; + } + +readpage: + /* ... and start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (!error) { + if (PageUptodate(page)) + goto page_ok; + wait_on_page_locked(page); + if (PageUptodate(page)) + goto page_ok; + error = -EIO; + } + + /* UHHUH! A synchronous read error occurred. Report it */ + desc->error = error; + page_cache_release(page); + break; + +no_cached_page: + /* + * Ok, it wasn't cached, so we need to create a new + * page.. 
+ */ + if (!cached_page) { + cached_page = page_cache_alloc_cold(mapping); + if (!cached_page) { + desc->error = -ENOMEM; + break; + } + } + error = add_to_page_cache_lru(cached_page, mapping, + index, GFP_KERNEL); + if (error) { + if (error == -EEXIST) + goto find_page; + desc->error = error; + break; + } + page = cached_page; + cached_page = NULL; + goto readpage; + } + + *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; + if (cached_page) + page_cache_release(cached_page); + update_atime(inode); +} + +EXPORT_SYMBOL(do_generic_mapping_read); + +int file_read_actor(read_descriptor_t *desc, struct page *page, + unsigned long offset, unsigned long size) +{ + char *kaddr; + unsigned long left, count = desc->count; + + if (size > count) + size = count; + + /* + * Faults on the destination of a read are common, so do it before + * taking the kmap. + */ + if (!fault_in_pages_writeable(desc->buf, size)) { + kaddr = kmap_atomic(page, KM_USER0); + left = __copy_to_user(desc->buf, kaddr + offset, size); + kunmap_atomic(kaddr, KM_USER0); + if (left == 0) + goto success; + } + + /* Do it the slow way */ + kaddr = kmap(page); + left = __copy_to_user(desc->buf, kaddr + offset, size); + kunmap(page); + + if (left) { + size -= left; + desc->error = -EFAULT; + } +success: + desc->count = count - size; + desc->written += size; + desc->buf += size; + return size; +} + +/* + * This is the "read()" routine for all filesystems + * that can use the page cache directly. + */ +ssize_t +__generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct file *filp = iocb->ki_filp; + ssize_t retval; + unsigned long seg; + size_t count; + + count = 0; + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
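The segment check just below relies on a sign-bit trick: OR-ing the running total with the current length is negative as a ssize_t exactly when either value has its top bit set, which catches both an oversized segment and wraparound of the sum in one comparison. Demonstrated standalone:

#include <stdio.h>
#include <stddef.h>
#include <sys/types.h>

static int segments_ok(const size_t *lens, int n)
{
    size_t count = 0;
    int i;

    for (i = 0; i < n; i++) {
        count += lens[i];
        if ((ssize_t)(count | lens[i]) < 0)
            return 0;           /* oversized segment or wrapped sum */
    }
    return 1;
}

int main(void)
{
    size_t good[] = { 4096, 8192 };
    size_t bad[]  = { (size_t)-1 / 2 + 1, 1 };  /* sign bit set */

    printf("good: %d, bad: %d\n",
           segments_ok(good, 2), segments_ok(bad, 2));
    return 0;
}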
+ */ + count += iv->iov_len; + if (unlikely((ssize_t)(count|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + nr_segs = seg; + count -= iv->iov_len; /* This segment is no good */ + break; + } + + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ + if (filp->f_flags & O_DIRECT) { + loff_t pos = *ppos, size; + struct address_space *mapping; + struct inode *inode; + + mapping = filp->f_dentry->d_inode->i_mapping; + inode = mapping->host; + retval = 0; + if (!count) + goto out; /* skip atime */ + size = i_size_read(inode); + if (pos < size) { + retval = generic_file_direct_IO(READ, iocb, + iov, pos, nr_segs); + if (retval >= 0 && !is_sync_kiocb(iocb)) + retval = -EIOCBQUEUED; + if (retval > 0) + *ppos = pos + retval; + } + update_atime(filp->f_dentry->d_inode); + goto out; + } + + retval = 0; + if (count) { + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; + + desc.written = 0; + desc.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_generic_file_read(filp,ppos,&desc,file_read_actor); + retval += desc.written; + if (!retval) { + retval = desc.error; + break; + } + } + } +out: + return retval; +} + +EXPORT_SYMBOL(__generic_file_aio_read); + +ssize_t +generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos) +{ + struct iovec local_iov = { .iov_base = buf, .iov_len = count }; + + BUG_ON(iocb->ki_pos != pos); + return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); +} + +EXPORT_SYMBOL(generic_file_aio_read); + +ssize_t +generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +{ + struct iovec local_iov = { .iov_base = buf, .iov_len = count }; + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + ret = __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + return ret; +} + +EXPORT_SYMBOL(generic_file_read); + +int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) +{ + ssize_t written; + unsigned long count = desc->count; + struct file *file = (struct file *) desc->buf; + + if (size > count) + size = count; + + written = file->f_op->sendpage(file, page, offset, + size, &file->f_pos, size<count); + if (written < 0) { + desc->error = written; + written = 0; + } + desc->count = count - written; + desc->written += written; + return written; +} + +ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, + size_t count, read_actor_t actor, void __user *target) +{ + read_descriptor_t desc; + + if (!count) + return 0; + + desc.written = 0; + desc.count = count; + desc.buf = target; + desc.error = 0; + + do_generic_file_read(in_file, ppos, &desc, actor); + if (desc.written) + return desc.written; + return desc.error; +} + +EXPORT_SYMBOL(generic_file_sendfile); + +static ssize_t +do_readahead(struct address_space *mapping, struct file *filp, + unsigned long index, unsigned long nr) +{ + if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + return -EINVAL; + + force_page_cache_readahead(mapping, filp, index, + max_sane_readahead(nr)); + return 0; +} + +asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +{ + ssize_t ret; + struct file *file; + + ret = -EBADF; + file = fget(fd); + if (file) { + if (file->f_mode & FMODE_READ) { + struct address_space *mapping = 
file->f_dentry->d_inode->i_mapping; + unsigned long start = offset >> PAGE_CACHE_SHIFT; + unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + unsigned long len = end - start + 1; + ret = do_readahead(mapping, file, start, len); + } + fput(file); + } + return ret; +} + +#ifdef CONFIG_MMU +/* + * This adds the requested page to the page cache if it isn't already there, + * and schedules an I/O to read in its contents from disk. + */ +static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); +static int page_cache_read(struct file * file, unsigned long offset) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct page *page; + int error; + + page = page_cache_alloc_cold(mapping); + if (!page) + return -ENOMEM; + + error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); + if (!error) { + error = mapping->a_ops->readpage(file, page); + page_cache_release(page); + return error; + } + + /* + * We arrive here in the unlikely event that someone + * raced with us and added our page to the cache first + * or we are out of memory for radix-tree nodes. + */ + page_cache_release(page); + return error == -EEXIST ? 0 : error; +} + +#define MMAP_READAROUND (16UL) +#define MMAP_LOTSAMISS (100) + +/* + * filemap_nopage() is invoked via the vma operations vector for a + * mapped memory region to read in file data during a page fault. + * + * The goto's are kind of ugly, but this streamlines the normal case of having + * it in the page cache, and handles the special cases reasonably without + * having a lot of duplicated code. + */ +struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused) +{ + int error; + struct file *file = area->vm_file; + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct file_ra_state *ra = &file->f_ra; + struct inode *inode = mapping->host; + struct page *page; + unsigned long size, pgoff, endoff; + int did_readaround = 0; + + pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; + +retry_all: + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (pgoff >= size) + goto outside_data_content; + + /* If we don't want any read-ahead, don't bother */ + if (VM_RandomReadHint(area)) + goto no_cached_page; + + /* + * The "size" of the file, as far as mmap is concerned, isn't bigger + * than the mapping + */ + if (size > endoff) + size = endoff; + + /* + * The readahead code wants to be told about each and every page + * so it can build and shrink its windows appropriately + * + * For sequential accesses, we use the generic readahead logic. + */ + if (VM_SequentialReadHint(area)) + page_cache_readahead(mapping, ra, file, pgoff); + + /* + * Do we have something in the page cache already? + */ +retry_find: + page = find_get_page(mapping, pgoff); + if (!page) { + if (VM_SequentialReadHint(area)) { + handle_ra_miss(mapping, ra, pgoff); + goto no_cached_page; + } + ra->mmap_miss++; + + /* + * Do we miss much more than hit in this file? If so, + * stop bothering with read-ahead. It will only hurt. 
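Two small things happen in the readaround path that follows: a hit/miss ratio switches readaround off for clearly random workloads, and the window start is rounded down to a MMAP_READAROUND boundary so consecutive nearby faults reuse the same 16-page window. The rounding, in isolation:

#include <stdio.h>

#define MMAP_READAROUND 16UL    /* pages per readaround window */

int main(void)
{
    unsigned long pgoff;

    for (pgoff = 13; pgoff < 50; pgoff += 17)
        printf("fault at page %2lu -> window starts at page %2lu\n",
               pgoff, pgoff & ~(MMAP_READAROUND - 1));
    return 0;
}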
+ */ + if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) + goto no_cached_page; + + did_readaround = 1; + do_page_cache_readahead(mapping, file, + pgoff & ~(MMAP_READAROUND-1), MMAP_READAROUND); + goto retry_find; + } + + if (!did_readaround) + ra->mmap_hit++; + + /* + * Ok, found a page in the page cache, now we need to check + * that it's up-to-date. + */ + if (!PageUptodate(page)) + goto page_not_uptodate; + +success: + /* + * Found the page and have a reference on it. + */ + mark_page_accessed(page); + return page; + +outside_data_content: + /* + * An external ptracer can access pages that normally aren't + * accessible.. + */ + if (area->vm_mm == current->mm) + return NULL; + /* Fall through to the non-read-ahead case */ +no_cached_page: + /* + * We're only likely to ever get here if MADV_RANDOM is in + * effect. + */ + error = page_cache_read(file, pgoff); + + /* + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. + */ + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. + */ + if (error == -ENOMEM) + return NOPAGE_OOM; + return NULL; + +page_not_uptodate: + inc_page_state(pgmajfault); + lock_page(page); + + /* Did it get unhashed while we waited for it? */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + goto retry_all; + } + + /* Did somebody else get it up-to-date? */ + if (PageUptodate(page)) { + unlock_page(page); + goto success; + } + + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page_locked(page); + if (PageUptodate(page)) + goto success; + } + + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + lock_page(page); + + /* Somebody truncated the page on us? */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + goto retry_all; + } + + /* Somebody else successfully read it in? */ + if (PageUptodate(page)) { + unlock_page(page); + goto success; + } + ClearPageError(page); + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page_locked(page); + if (PageUptodate(page)) + goto success; + } + + /* + * Things didn't work out. Return zero to tell the + * mm layer so, possibly freeing the page cache page first. + */ + page_cache_release(page); + return NULL; +} + +EXPORT_SYMBOL(filemap_nopage); + +static struct page * filemap_getpage(struct file *file, unsigned long pgoff, + int nonblock) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct page *page; + int error; + + /* + * Do we have something in the page cache already? + */ +retry_find: + page = find_get_page(mapping, pgoff); + if (!page) { + if (nonblock) + return NULL; + goto no_cached_page; + } + + /* + * Ok, found a page in the page cache, now we need to check + * that it's up-to-date. + */ + if (!PageUptodate(page)) + goto page_not_uptodate; + +success: + /* + * Found the page and have a reference on it. + */ + mark_page_accessed(page); + return page; + +no_cached_page: + error = page_cache_read(file, pgoff); + + /* + * The page we want has now been added to the page cache. + * In the unlikely event that someone removed it in the + * meantime, we'll just come back here and read it again. 
+ */ + if (error >= 0) + goto retry_find; + + /* + * An error return from page_cache_read can result if the + * system is low on memory, or a problem occurs while trying + * to schedule I/O. + */ + return NULL; + +page_not_uptodate: + lock_page(page); + + /* Did it get unhashed while we waited for it? */ + if (!page->mapping) { + unlock_page(page); + goto err; + } + + /* Did somebody else get it up-to-date? */ + if (PageUptodate(page)) { + unlock_page(page); + goto success; + } + + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page_locked(page); + if (PageUptodate(page)) + goto success; + } + + /* + * Umm, take care of errors if the page isn't up-to-date. + * Try to re-read it _once_. We do this synchronously, + * because there really aren't any performance issues here + * and we need to check for errors. + */ + lock_page(page); + + /* Somebody truncated the page on us? */ + if (!page->mapping) { + unlock_page(page); + goto err; + } + /* Somebody else successfully read it in? */ + if (PageUptodate(page)) { + unlock_page(page); + goto success; + } + + ClearPageError(page); + if (!mapping->a_ops->readpage(file, page)) { + wait_on_page_locked(page); + if (PageUptodate(page)) + goto success; + } + + /* + * Things didn't work out. Return zero to tell the + * mm layer so, possibly freeing the page cache page first. + */ +err: + page_cache_release(page); + + return NULL; +} + +static int filemap_populate(struct vm_area_struct *vma, + unsigned long addr, + unsigned long len, + pgprot_t prot, + unsigned long pgoff, + int nonblock) +{ + struct file *file = vma->vm_file; + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + unsigned long size; + struct mm_struct *mm = vma->vm_mm; + struct page *page; + int err; + + if (!nonblock) + force_page_cache_readahead(mapping, vma->vm_file, + pgoff, len >> PAGE_CACHE_SHIFT); + +repeat: + size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (pgoff + (len >> PAGE_CACHE_SHIFT) > size) + return -EINVAL; + + page = filemap_getpage(file, pgoff, nonblock); + if (!page && !nonblock) + return -ENOMEM; + if (page) { + err = install_page(mm, vma, addr, page, prot); + if (err) { + page_cache_release(page); + return err; + } + } else { + /* + * If a nonlinear mapping then store the file page offset + * in the pte. + */ + unsigned long pgidx; + pgidx = (addr - vma->vm_start) >> PAGE_SHIFT; + pgidx += vma->vm_pgoff; + pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; + if (pgoff != pgidx) { + err = install_file_pte(mm, vma, addr, pgoff, prot); + if (err) + return err; + } + } + + len -= PAGE_SIZE; + addr += PAGE_SIZE; + pgoff++; + if (len) + goto repeat; + + return 0; +} + +static struct vm_operations_struct generic_file_vm_ops = { + .nopage = filemap_nopage, + .populate = filemap_populate, +}; + +/* This is used for a general mmap of a disk file */ + +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + + if (!mapping->a_ops->readpage) + return -ENOEXEC; + update_atime(inode); + vma->vm_ops = &generic_file_vm_ops; + return 0; +} + +/* + * This is for filesystems which do not implement ->writepage. 
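The test in generic_file_readonly_mmap() below refuses only mappings that are both shared and potentially writable; a writable MAP_PRIVATE mapping stays legal because its dirty pages never reach ->writepage. The predicate in isolation, with flag values as in contemporary kernels (illustrative here):

#include <stdio.h>

#define VM_SHARED   0x08
#define VM_MAYWRITE 0x20

static int readonly_mmap_ok(unsigned long vm_flags)
{
    return !((vm_flags & VM_SHARED) && (vm_flags & VM_MAYWRITE));
}

int main(void)
{
    printf("MAP_PRIVATE rw: %d\n", readonly_mmap_ok(VM_MAYWRITE));
    printf("MAP_SHARED  rw: %d\n",
           readonly_mmap_ok(VM_SHARED | VM_MAYWRITE));
    return 0;
}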
+ */ +int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + return generic_file_mmap(file, vma); +} +#else +int generic_file_mmap(struct file * file, struct vm_area_struct * vma) +{ + return -ENOSYS; +} +int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) +{ + return -ENOSYS; +} +#endif /* CONFIG_MMU */ + +EXPORT_SYMBOL(generic_file_mmap); +EXPORT_SYMBOL(generic_file_readonly_mmap); + +static inline struct page *__read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page, *cached_page = NULL; + int err; +repeat: + page = find_get_page(mapping, index); + if (!page) { + if (!cached_page) { + cached_page = page_cache_alloc_cold(mapping); + if (!cached_page) + return ERR_PTR(-ENOMEM); + } + err = add_to_page_cache_lru(cached_page, mapping, + index, GFP_KERNEL); + if (err == -EEXIST) + goto repeat; + if (err < 0) { + /* Presumably ENOMEM for radix tree node */ + page_cache_release(cached_page); + return ERR_PTR(err); + } + page = cached_page; + cached_page = NULL; + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } + } + if (cached_page) + page_cache_release(cached_page); + return page; +} + +/* + * Read into the page cache. If a page already exists, + * and PageUptodate() is not set, try to fill the page. + */ +struct page *read_cache_page(struct address_space *mapping, + unsigned long index, + int (*filler)(void *,struct page*), + void *data) +{ + struct page *page; + int err; + +retry: + page = __read_cache_page(mapping, index, filler, data); + if (IS_ERR(page)) + goto out; + mark_page_accessed(page); + if (PageUptodate(page)) + goto out; + + lock_page(page); + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + goto retry; + } + if (PageUptodate(page)) { + unlock_page(page); + goto out; + } + err = filler(data, page); + if (err < 0) { + page_cache_release(page); + page = ERR_PTR(err); + } + out: + return page; +} + +EXPORT_SYMBOL(read_cache_page); + +/* + * If the page was newly created, increment its refcount and add it to the + * caller's lru-buffering pagevec. This function is specifically for + * generic_file_write(). + */ +static inline struct page * +__grab_cache_page(struct address_space *mapping, unsigned long index, + struct page **cached_page, struct pagevec *lru_pvec) +{ + int err; + struct page *page; +repeat: + page = find_lock_page(mapping, index); + if (!page) { + if (!*cached_page) { + *cached_page = page_cache_alloc(mapping); + if (!*cached_page) + return NULL; + } + err = add_to_page_cache(*cached_page, mapping, + index, GFP_KERNEL); + if (err == -EEXIST) + goto repeat; + if (err == 0) { + page = *cached_page; + page_cache_get(page); + if (!pagevec_add(lru_pvec, page)) + __pagevec_lru_add(lru_pvec); + *cached_page = NULL; + } + } + return page; +} + +void remove_suid(struct dentry *dentry) +{ + struct iattr newattrs; + struct inode *inode = dentry->d_inode; + unsigned int mode = inode->i_mode & (S_ISUID|S_ISGID|S_IXGRP); + + if (!(mode & S_IXGRP)) + mode &= S_ISUID; + + /* were any of the uid bits set? 
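The mode arithmetic in remove_suid() above is terse: setgid only counts when the group-execute bit is also present (S_ISGID without S_IXGRP means mandatory locking, not an executable), otherwise only S_ISUID survives the mask. Traced through the cases:

#include <stdio.h>
#include <sys/stat.h>

/* mirrors the masking in remove_suid(): which bits force an attr kill? */
static unsigned int suid_bits(mode_t i_mode)
{
    unsigned int mode = i_mode & (S_ISUID | S_ISGID | S_IXGRP);

    if (!(mode & S_IXGRP))
        mode &= S_ISUID;    /* sgid without group-exec = mandatory lock */
    return mode;
}

int main(void)
{
    printf("suid exec    : %o\n", suid_bits(S_ISUID | 0755));
    printf("sgid no-exec : %o\n", suid_bits(S_ISGID | 0644));
    printf("sgid exec    : %o\n", suid_bits(S_ISGID | 0755));
    return 0;
}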
*/ + if (mode && !capable(CAP_FSETID)) { + newattrs.ia_valid = ATTR_KILL_SUID|ATTR_KILL_SGID|ATTR_FORCE; + notify_change(dentry, &newattrs); + } +} + +EXPORT_SYMBOL(remove_suid); + +/* + * Copy as much as we can into the page and return the number of bytes which + * were sucessfully copied. If a fault is encountered then clear the page + * out to (offset+bytes) and return the number of bytes which were copied. + */ +static inline size_t +filemap_copy_from_user(struct page *page, unsigned long offset, + const char __user *buf, unsigned bytes) +{ + char *kaddr; + int left; + + kaddr = kmap_atomic(page, KM_USER0); + left = __copy_from_user(kaddr + offset, buf, bytes); + kunmap_atomic(kaddr, KM_USER0); + + if (left != 0) { + /* Do it the slow way */ + kaddr = kmap(page); + left = __copy_from_user(kaddr + offset, buf, bytes); + kunmap(page); + } + return bytes - left; +} + +static size_t +__filemap_copy_from_user_iovec(char *vaddr, + const struct iovec *iov, size_t base, size_t bytes) +{ + size_t copied = 0, left = 0; + + while (bytes) { + char __user *buf = iov->iov_base + base; + int copy = min(bytes, iov->iov_len - base); + + base = 0; + left = __copy_from_user(vaddr, buf, copy); + copied += copy; + bytes -= copy; + vaddr += copy; + iov++; + + if (unlikely(left)) { + /* zero the rest of the target like __copy_from_user */ + if (bytes) + memset(vaddr, 0, bytes); + break; + } + } + return copied - left; +} + +/* + * This has the same sideeffects and return value as filemap_copy_from_user(). + * The difference is that on a fault we need to memset the remainder of the + * page (out to offset+bytes), to emulate filemap_copy_from_user()'s + * single-segment behaviour. + */ +static inline size_t +filemap_copy_from_user_iovec(struct page *page, unsigned long offset, + const struct iovec *iov, size_t base, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap_atomic(page, KM_USER0); + copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, + base, bytes); + kunmap_atomic(kaddr, KM_USER0); + if (copied != bytes) { + kaddr = kmap(page); + copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, + base, bytes); + kunmap(page); + } + return copied; +} + +static inline void +filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) +{ + const struct iovec *iov = *iovp; + size_t base = *basep; + + while (bytes) { + int copy = min(bytes, iov->iov_len - base); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } + *iovp = iov; + *basep = base; +} + +/* + * Performs necessary checks before doing a write + * + * Can adjust writing position aor amount of bytes to write. + * Returns appropriate error code that caller should return or + * zero in case that write should be allowed. 
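Reduced to its core, generic_write_checks() below either refuses the write or shortens count so that it fits under RLIMIT_FSIZE and the filesystem's s_maxbytes. A simplified model, leaving out SIGXFSZ delivery, the O_LARGEFILE cases and the block-device branch (error codes illustrative):

#include <stdio.h>

static int write_checks(long long *pos, unsigned long long *count,
                        unsigned long long limit, long long maxbytes)
{
    if (*pos < 0)
        return -22;                     /* -EINVAL */
    if (*pos >= (long long)limit)
        return -27;                     /* -EFBIG, would raise SIGXFSZ */
    if (*count > limit - *pos)
        *count = limit - *pos;          /* short write at the rlimit */
    if (*pos >= maxbytes)
        return -27;
    if (*pos + (long long)*count > maxbytes)
        *count = maxbytes - *pos;       /* short write at the fs cap */
    return 0;
}

int main(void)
{
    long long pos = 1000;
    unsigned long long count = 5000;
    int err = write_checks(&pos, &count, 4096, 1 << 20);

    printf("err=%d, count clamped to %llu\n", err, count);
    return 0;
}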
+ */ +inline int generic_write_checks(struct inode *inode, + struct file *file, loff_t *pos, size_t *count, int isblk) +{ + unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + + if (unlikely(*pos < 0)) + return -EINVAL; + + if (unlikely(file->f_error)) { + int err = file->f_error; + file->f_error = 0; + return err; + } + + if (!isblk) { + /* FIXME: this is for backwards compatibility with 2.4 */ + if (file->f_flags & O_APPEND) + *pos = i_size_read(inode); + + if (limit != RLIM_INFINITY) { + if (*pos >= limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + if (*count > limit - (typeof(limit))*pos) { + *count = limit - (typeof(limit))*pos; + } + } + } + + /* + * LFS rule + */ + if (unlikely(*pos + *count > MAX_NON_LFS && + !(file->f_flags & O_LARGEFILE))) { + if (*pos >= MAX_NON_LFS) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + if (*count > MAX_NON_LFS - (unsigned long)*pos) { + *count = MAX_NON_LFS - (unsigned long)*pos; + } + } + + /* + * Are we about to exceed the fs block limit ? + * + * If we have written data it becomes a short write. If we have + * exceeded without writing data we send a signal and return EFBIG. + * Linus frestrict idea will clean these up nicely.. + */ + if (likely(!isblk)) { + if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { + if (*count || *pos > inode->i_sb->s_maxbytes) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + /* zero-length writes at ->s_maxbytes are OK */ + } + + if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) + *count = inode->i_sb->s_maxbytes - *pos; + } else { + loff_t isize; + if (bdev_read_only(inode->i_bdev)) + return -EPERM; + isize = i_size_read(inode); + if (*pos >= isize) { + if (*count || *pos > isize) + return -ENOSPC; + } + + if (*pos + *count > isize) + *count = isize - *pos; + } + return 0; +} + +EXPORT_SYMBOL(generic_write_checks); + +/* + * Write to a file through the page cache. + * + * We put everything into the page cache prior to writing it. This is not a + * problem when writing full pages. With partial pages, however, we first have + * to read the data into the cache, then dirty the page, and finally schedule + * it for writing by marking it dirty. + * okir@monad.swb.de + */ +ssize_t +generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct file *file = iocb->ki_filp; + struct address_space * mapping = file->f_dentry->d_inode->i_mapping; + struct address_space_operations *a_ops = mapping->a_ops; + size_t ocount; /* original count */ + size_t count; /* after file limit checks */ + struct inode *inode = mapping->host; + long status = 0; + loff_t pos; + struct page *page; + struct page *cached_page = NULL; + const int isblk = S_ISBLK(inode->i_mode); + ssize_t written; + ssize_t err; + size_t bytes; + struct pagevec lru_pvec; + const struct iovec *cur_iov = iov; /* current iovec */ + size_t iov_base = 0; /* offset in the current iovec */ + unsigned long seg; + char __user *buf; + + ocount = 0; + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
+ */ + ocount += iv->iov_len; + if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + nr_segs = seg; + ocount -= iv->iov_len; /* This segment is no good */ + break; + } + + count = ocount; + pos = *ppos; + pagevec_init(&lru_pvec, 0); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + written = 0; + + err = generic_write_checks(inode, file, &pos, &count, isblk); + if (err) + goto out; + + + if (count == 0) + goto out; + + remove_suid(file->f_dentry); + inode_update_time(inode, 1); + + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ + if (unlikely(file->f_flags & O_DIRECT)) { + if (count != ocount) + nr_segs = iov_shorten((struct iovec *)iov, + nr_segs, count); + written = generic_file_direct_IO(WRITE, iocb, + iov, pos, nr_segs); + if (written > 0) { + loff_t end = pos + written; + if (end > i_size_read(inode) && !isblk) { + i_size_write(inode, end); + mark_inode_dirty(inode); + } + *ppos = end; + } + /* + * Sync the fs metadata but not the minor inode changes and + * of course not the data as we did direct DMA for the IO. + */ + if (written >= 0 && file->f_flags & O_SYNC) + status = generic_osync_inode(inode, OSYNC_METADATA); + if (written >= 0 && !is_sync_kiocb(iocb)) + written = -EIOCBQUEUED; + goto out_status; + } + + buf = iov->iov_base; + do { + unsigned long index; + unsigned long offset; + size_t copied; + + offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + index = pos >> PAGE_CACHE_SHIFT; + bytes = PAGE_CACHE_SIZE - offset; + if (bytes > count) + bytes = count; + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + */ + fault_in_pages_readable(buf, bytes); + + page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); + if (!page) { + status = -ENOMEM; + break; + } + + status = a_ops->prepare_write(file, page, offset, offset+bytes); + if (unlikely(status)) { + loff_t isize = i_size_read(inode); + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. + */ + unlock_page(page); + page_cache_release(page); + if (pos + bytes > isize) + vmtruncate(inode, isize); + break; + } + if (likely(nr_segs == 1)) + copied = filemap_copy_from_user(page, offset, + buf, bytes); + else + copied = filemap_copy_from_user_iovec(page, offset, + cur_iov, iov_base, bytes); + flush_dcache_page(page); + status = a_ops->commit_write(file, page, offset, offset+bytes); + if (likely(copied > 0)) { + if (!status) + status = copied; + + if (status >= 0) { + written += status; + count -= status; + pos += status; + buf += status; + if (unlikely(nr_segs > 1)) + filemap_set_next_iovec(&cur_iov, + &iov_base, status); + } + } + if (unlikely(copied != bytes)) + if (status >= 0) + status = -EFAULT; + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + if (status < 0) + break; + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } while (count); + *ppos = pos; + + if (cached_page) + page_cache_release(cached_page); + + /* + * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC + */ + if (status >= 0) { + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) + status = generic_osync_inode(inode, + OSYNC_METADATA|OSYNC_DATA); + } + +out_status: + err = written ? 
written : status; +out: + pagevec_lru_add(&lru_pvec); + current->backing_dev_info = 0; + return err; +} + +EXPORT_SYMBOL(generic_file_aio_write_nolock); + +ssize_t +generic_file_write_nolock(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + ret = generic_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + return ret; +} + +EXPORT_SYMBOL(generic_file_write_nolock); + +ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, + size_t count, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + ssize_t err; + struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; + + BUG_ON(iocb->ki_pos != pos); + + down(&inode->i_sem); + err = generic_file_aio_write_nolock(iocb, &local_iov, 1, + &iocb->ki_pos); + up(&inode->i_sem); + + return err; +} + +EXPORT_SYMBOL(generic_file_aio_write); + +ssize_t generic_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + ssize_t err; + struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count }; + + down(&inode->i_sem); + err = generic_file_write_nolock(file, &local_iov, 1, ppos); + up(&inode->i_sem); + + return err; +} + +EXPORT_SYMBOL(generic_file_write); + +ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct kiocb kiocb; + ssize_t ret; + + init_sync_kiocb(&kiocb, filp); + ret = __generic_file_aio_read(&kiocb, iov, nr_segs, ppos); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&kiocb); + return ret; +} + +EXPORT_SYMBOL(generic_file_readv); + +ssize_t generic_file_writev(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t * ppos) +{ + struct inode *inode = file->f_dentry->d_inode; + ssize_t ret; + + down(&inode->i_sem); + ret = generic_file_write_nolock(file, iov, nr_segs, ppos); + up(&inode->i_sem); + return ret; +} + +EXPORT_SYMBOL(generic_file_writev); + +ssize_t +generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + ssize_t retval; + + if (mapping->nrpages) { + retval = filemap_fdatawrite(mapping); + if (retval == 0) + retval = filemap_fdatawait(mapping); + if (retval) + goto out; + } + + retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); + if (rw == WRITE && mapping->nrpages) + invalidate_inode_pages2(mapping); +out: + return retval; +} + +EXPORT_SYMBOL_GPL(generic_file_direct_IO); diff -Nru a/mm/memory.c b/mm/memory.c --- a/mm/memory.c Fri Oct 31 14:10:53 2003 +++ b/mm/memory.c Fri Oct 31 14:10:53 2003 @@ -44,6 +44,7 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/rmap-locking.h> +#include <linux/init.h> #include <linux/module.h> #include <asm/pgalloc.h> @@ -680,6 +681,24 @@ } +static struct vm_area_struct fixmap_vma = { + /* Catch users - if there are any valid + ones, we can make this be "&init_mm" or + something. 
*/ + .vm_mm = NULL, + .vm_page_prot = PAGE_READONLY, + .vm_flags = VM_READ | VM_EXEC, +}; + +static int init_fixmap_vma(void) +{ + fixmap_vma.vm_start = FIXADDR_START; + fixmap_vma.vm_end = FIXADDR_TOP; + return(0); +} + +__initcall(init_fixmap_vma); + int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -700,19 +719,8 @@ vma = find_extend_vma(mm, start); -#ifdef FIXADDR_USER_START - if (!vma && - start >= FIXADDR_USER_START && start < FIXADDR_USER_END) { - static struct vm_area_struct fixmap_vma = { - /* Catch users - if there are any valid - ones, we can make this be "&init_mm" or - something. */ - .vm_mm = NULL, - .vm_start = FIXADDR_USER_START, - .vm_end = FIXADDR_USER_END, - .vm_page_prot = PAGE_READONLY, - .vm_flags = VM_READ | VM_EXEC, - }; +#ifdef FIXADDR_START + if (!vma && start >= FIXADDR_START && start < FIXADDR_TOP) { unsigned long pg = start & PAGE_MASK; pgd_t *pgd; pmd_t *pmd; diff -Nru a/mm/memory.c~init_fixmap_vma.diff b/mm/memory.c~init_fixmap_vma.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/memory.c~init_fixmap_vma.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,1703 @@ +/* + * linux/mm/memory.c + * + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +/* + * demand-loading started 01.12.91 - seems it is high on the list of + * things wanted, and it should be easy to implement. - Linus + */ + +/* + * Ok, demand-loading was easy, shared pages a little bit tricker. Shared + * pages started 02.12.91, seems to work. - Linus. + * + * Tested sharing by executing about 30 /bin/sh: under the old kernel it + * would have taken more than the 6M I have free, but it worked well as + * far as I could see. + * + * Also corrected some "invalidate()"s - I wasn't doing enough of them. + */ + +/* + * Real VM (paging to/from disk) started 18.12.91. Much more work and + * thought has to go into this. Oh, well.. + * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. + * Found it. Everything seems to work now. + * 20.12.91 - Ok, making the swap-device changeable like the root. + */ + +/* + * 05.04.94 - Multi-page memory management added for v1.1. + * Idea by Alex Bligh (alex@cconcepts.co.uk) + * + * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG + * (Gerhard.Wichert@pdb.siemens.de) + */ + +#include <linux/kernel_stat.h> +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/mman.h> +#include <linux/swap.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/rmap-locking.h> +#include <linux/module.h> + +#include <asm/pgalloc.h> +#include <asm/rmap.h> +#include <asm/uaccess.h> +#include <asm/tlb.h> +#include <asm/tlbflush.h> +#include <asm/pgtable.h> + +#include <linux/swapops.h> + +#ifndef CONFIG_DISCONTIGMEM +/* use the per-pgdat data instead for discontigmem - mbligh */ +unsigned long max_mapnr; +struct page *mem_map; + +EXPORT_SYMBOL(max_mapnr); +EXPORT_SYMBOL(mem_map); +#endif + +unsigned long num_physpages; +void * high_memory; +struct page *highmem_start_page; + +EXPORT_SYMBOL(num_physpages); +EXPORT_SYMBOL(highmem_start_page); +EXPORT_SYMBOL(high_memory); + +/* + * We special-case the C-O-W ZERO_PAGE, because it's such + * a common occurrence (no need to read the page to know + * that it's zero - better for the cache and memory subsystem). 
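copy_cow_page() just below special-cases the shared zero page: copying "from" it degenerates into clearing the destination, so the source page is never read. A standalone version, with a static buffer standing in for ZERO_PAGE():

#include <string.h>
#include <stdio.h>

#define PAGE_SIZE 4096
static const char zero_page[PAGE_SIZE];  /* stands in for ZERO_PAGE() */

static void copy_cow_page(const char *from, char *to)
{
    if (from == zero_page) {    /* no need to read a page of zeroes */
        memset(to, 0, PAGE_SIZE);
        return;
    }
    memcpy(to, from, PAGE_SIZE);
}

int main(void)
{
    char dst[PAGE_SIZE];

    copy_cow_page(zero_page, dst);
    printf("dst[0] = %d\n", dst[0]);
    return 0;
}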
+ */ +static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address) +{ + if (from == ZERO_PAGE(address)) { + clear_user_highpage(to, address); + return; + } + copy_user_highpage(to, from, address); +} + +/* + * Note: this doesn't free the actual pages themselves. That + * has been handled earlier when unmapping all the memory regions. + */ +static inline void free_one_pmd(struct mmu_gather *tlb, pmd_t * dir) +{ + struct page *page; + + if (pmd_none(*dir)) + return; + if (pmd_bad(*dir)) { + pmd_ERROR(*dir); + pmd_clear(dir); + return; + } + page = pmd_page(*dir); + pmd_clear(dir); + pgtable_remove_rmap(page); + pte_free_tlb(tlb, page); +} + +static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) +{ + int j; + pmd_t * pmd; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, 0); + pgd_clear(dir); + for (j = 0; j < PTRS_PER_PMD ; j++) + free_one_pmd(tlb, pmd+j); + pmd_free_tlb(tlb, pmd); +} + +/* + * This function clears all user-level page tables of a process - this + * is needed by execve(), so that old pages aren't in the way. + * + * Must be called with pagetable lock held. + */ +void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr) +{ + pgd_t * page_dir = tlb->mm->pgd; + + page_dir += first; + do { + free_one_pgd(tlb, page_dir); + page_dir++; + } while (--nr); +} + +pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (!pmd_present(*pmd)) { + struct page *new; + + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pmd_present(*pmd)) { + pte_free(new); + goto out; + } + pgtable_add_rmap(new, mm, address); + pmd_populate(mm, pmd, new); + } +out: + return pte_offset_map(pmd, address); +} + +pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (!pmd_present(*pmd)) { + pte_t *new; + + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one_kernel(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pmd_present(*pmd)) { + pte_free_kernel(new); + goto out; + } + pgtable_add_rmap(virt_to_page(new), mm, address); + pmd_populate_kernel(mm, pmd, new); + } +out: + return pte_offset_kernel(pmd, address); +} +#define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t)) +#define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t)) + +/* + * copy one vm_area from one task to the other. Assumes the page tables + * already present in the new task to be cleared in the whole range + * covered by this vma. + * + * 08Jan98 Merged into one routine from several inline routines to reduce + * variable count and make things faster. -jj + * + * dst->page_table_lock is held on entry and exit, + * but may be dropped within pmd_alloc() and pte_alloc_map(). 
+ */ +int copy_page_range(struct mm_struct *dst, struct mm_struct *src, + struct vm_area_struct *vma) +{ + pgd_t * src_pgd, * dst_pgd; + unsigned long address = vma->vm_start; + unsigned long end = vma->vm_end; + unsigned long cow; + struct pte_chain *pte_chain = NULL; + + if (is_vm_hugetlb_page(vma)) + return copy_hugetlb_page_range(dst, src, vma); + + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + spin_unlock(&dst->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + spin_lock(&dst->page_table_lock); + if (!pte_chain) + goto nomem; + } + + cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; + src_pgd = pgd_offset(src, address)-1; + dst_pgd = pgd_offset(dst, address)-1; + + for (;;) { + pmd_t * src_pmd, * dst_pmd; + + src_pgd++; dst_pgd++; + + /* copy_pmd_range */ + + if (pgd_none(*src_pgd)) + goto skip_copy_pmd_range; + if (pgd_bad(*src_pgd)) { + pgd_ERROR(*src_pgd); + pgd_clear(src_pgd); +skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK; + if (!address || (address >= end)) + goto out; + continue; + } + + src_pmd = pmd_offset(src_pgd, address); + dst_pmd = pmd_alloc(dst, dst_pgd, address); + if (!dst_pmd) + goto nomem; + + do { + pte_t * src_pte, * dst_pte; + + /* copy_pte_range */ + + if (pmd_none(*src_pmd)) + goto skip_copy_pte_range; + if (pmd_bad(*src_pmd)) { + pmd_ERROR(*src_pmd); + pmd_clear(src_pmd); +skip_copy_pte_range: + address = (address + PMD_SIZE) & PMD_MASK; + if (address >= end) + goto out; + goto cont_copy_pmd_range; + } + + dst_pte = pte_alloc_map(dst, dst_pmd, address); + if (!dst_pte) + goto nomem; + spin_lock(&src->page_table_lock); + src_pte = pte_offset_map_nested(src_pmd, address); + do { + pte_t pte = *src_pte; + struct page *page; + unsigned long pfn; + + /* copy_one_pte */ + + if (pte_none(pte)) + goto cont_copy_pte_range_noset; + /* pte contains position in swap, so copy. */ + if (!pte_present(pte)) { + if (!pte_file(pte)) + swap_duplicate(pte_to_swp_entry(pte)); + set_pte(dst_pte, pte); + goto cont_copy_pte_range_noset; + } + pfn = pte_pfn(pte); + /* the pte points outside of valid memory, the + * mapping is assumed to be good, meaningful + * and not mapped via rmap - duplicate the + * mapping as is. + */ + page = NULL; + if (pfn_valid(pfn)) + page = pfn_to_page(pfn); + + if (!page || PageReserved(page)) { + set_pte(dst_pte, pte); + goto cont_copy_pte_range_noset; + } + + /* + * If it's a COW mapping, write protect it both + * in the parent and the child + */ + if (cow) { + ptep_set_wrprotect(src_pte); + pte = *src_pte; + } + + /* + * If it's a shared mapping, mark it clean in + * the child + */ + if (vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + get_page(page); + dst->rss++; + + set_pte(dst_pte, pte); + pte_chain = page_add_rmap(page, dst_pte, + pte_chain); + if (pte_chain) + goto cont_copy_pte_range_noset; + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (pte_chain) + goto cont_copy_pte_range_noset; + + /* + * pte_chain allocation failed, and we need to + * run page reclaim. 
+ */ + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + spin_unlock(&src->page_table_lock); + spin_unlock(&dst->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + spin_lock(&dst->page_table_lock); + if (!pte_chain) + goto nomem; + spin_lock(&src->page_table_lock); + dst_pte = pte_offset_map(dst_pmd, address); + src_pte = pte_offset_map_nested(src_pmd, + address); +cont_copy_pte_range_noset: + address += PAGE_SIZE; + if (address >= end) { + pte_unmap_nested(src_pte); + pte_unmap(dst_pte); + goto out_unlock; + } + src_pte++; + dst_pte++; + } while ((unsigned long)src_pte & PTE_TABLE_MASK); + pte_unmap_nested(src_pte-1); + pte_unmap(dst_pte-1); + spin_unlock(&src->page_table_lock); + +cont_copy_pmd_range: + src_pmd++; + dst_pmd++; + } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + } +out_unlock: + spin_unlock(&src->page_table_lock); +out: + pte_chain_free(pte_chain); + return 0; +nomem: + pte_chain_free(pte_chain); + return -ENOMEM; +} + +static void +zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd, + unsigned long address, unsigned long size) +{ + unsigned long offset; + pte_t *ptep; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return; + } + ptep = pte_offset_map(pmd, address); + offset = address & ~PMD_MASK; + if (offset + size > PMD_SIZE) + size = PMD_SIZE - offset; + size &= PAGE_MASK; + for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) { + pte_t pte = *ptep; + if (pte_none(pte)) + continue; + if (pte_present(pte)) { + unsigned long pfn = pte_pfn(pte); + + pte = ptep_get_and_clear(ptep); + tlb_remove_tlb_entry(tlb, ptep, address+offset); + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) { + if (pte_dirty(pte)) + set_page_dirty(page); + if (page->mapping && pte_young(pte) && + !PageSwapCache(page)) + mark_page_accessed(page); + tlb->freed++; + page_remove_rmap(page, ptep); + tlb_remove_page(tlb, page); + } + } + } else { + if (!pte_file(pte)) + free_swap_and_cache(pte_to_swp_entry(pte)); + pte_clear(ptep); + } + } + pte_unmap(ptep-1); +} + +static void +zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir, + unsigned long address, unsigned long size) +{ + pmd_t * pmd; + unsigned long end; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + end = address + size; + if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) + end = ((address + PGDIR_SIZE) & PGDIR_MASK); + do { + zap_pte_range(tlb, pmd, address, end - address); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address < end); +} + +void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long address, unsigned long end) +{ + pgd_t * dir; + + if (is_vm_hugetlb_page(vma)) { + unmap_hugepage_range(vma, address, end); + return; + } + + BUG_ON(address >= end); + + dir = pgd_offset(vma->vm_mm, address); + tlb_start_vma(tlb, vma); + do { + zap_pmd_range(tlb, dir, address, end - address); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + tlb_end_vma(tlb, vma); +} + +/* Dispose of an entire struct mmu_gather per rescheduling point */ +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) +#define ZAP_BLOCK_SIZE (FREE_PTE_NR * PAGE_SIZE) +#endif + +/* For UP, 256 pages at a time gives nice low latency */ +#if !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) +#define ZAP_BLOCK_SIZE (256 * PAGE_SIZE) +#endif + +/* No preempt: go for the best straight-line 
efficiency */ +#if !defined(CONFIG_PREEMPT) +#define ZAP_BLOCK_SIZE (~(0UL)) +#endif + +/** + * unmap_vmas - unmap a range of memory covered by a list of vma's + * @tlbp: address of the caller's struct mmu_gather + * @mm: the controlling mm_struct + * @vma: the starting vma + * @start_addr: virtual address at which to start unmapping + * @end_addr: virtual address at which to end unmapping + * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here + * + * Returns the number of vma's which were covered by the unmapping. + * + * Unmap all pages in the vma list. Called under page_table_lock. + * + * We aim to not hold page_table_lock for too long (for scheduling latency + * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to + * return the ending mmu_gather to the caller. + * + * Only addresses between `start' and `end' will be unmapped. + * + * The VMA list must be sorted in ascending virtual address order. + * + * unmap_vmas() assumes that the caller will flush the whole unmapped address + * range after unmap_vmas() returns. So the only responsibility here is to + * ensure that any thus-far unmapped pages are flushed before unmap_vmas() + * drops the lock and schedules. + */ +int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long start_addr, + unsigned long end_addr, unsigned long *nr_accounted) +{ + unsigned long zap_bytes = ZAP_BLOCK_SIZE; + unsigned long tlb_start; /* For tlb_finish_mmu */ + int tlb_start_valid = 0; + int ret = 0; + + if (vma) { /* debug. killme. */ + if (end_addr <= vma->vm_start) + printk("%s: end_addr(0x%08lx) <= vm_start(0x%08lx)\n", + __FUNCTION__, end_addr, vma->vm_start); + if (start_addr >= vma->vm_end) + printk("%s: start_addr(0x%08lx) <= vm_end(0x%08lx)\n", + __FUNCTION__, start_addr, vma->vm_end); + } + + for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { + unsigned long start; + unsigned long end; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + continue; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + continue; + + if (vma->vm_flags & VM_ACCOUNT) + *nr_accounted += (end - start) >> PAGE_SHIFT; + + ret++; + while (start != end) { + unsigned long block; + + if (is_vm_hugetlb_page(vma)) + block = end - start; + else + block = min(zap_bytes, end - start); + + if (!tlb_start_valid) { + tlb_start = start; + tlb_start_valid = 1; + } + + unmap_page_range(*tlbp, vma, start, start + block); + start += block; + zap_bytes -= block; + if ((long)zap_bytes > 0) + continue; + if (need_resched()) { + tlb_finish_mmu(*tlbp, tlb_start, start); + cond_resched_lock(&mm->page_table_lock); + *tlbp = tlb_gather_mmu(mm, 0); + tlb_start_valid = 0; + } + zap_bytes = ZAP_BLOCK_SIZE; + } + if (vma->vm_next && vma->vm_next->vm_start < vma->vm_end) + printk("%s: VMA list is not sorted correctly!\n", + __FUNCTION__); + } + return ret; +} + +/** + * zap_page_range - remove user pages in a given range + * @vma: vm_area_struct holding the applicable pages + * @address: starting address of pages to zap + * @size: number of bytes to zap + */ +void zap_page_range(struct vm_area_struct *vma, + unsigned long address, unsigned long size) +{ + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather *tlb; + unsigned long end = address + size; + unsigned long nr_accounted = 0; + + might_sleep(); + + if (is_vm_hugetlb_page(vma)) { + zap_hugepage_range(vma, address, size); + return; + } + + lru_add_drain(); + spin_lock(&mm->page_table_lock); + tlb = 
tlb_gather_mmu(mm, 0); + unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted); + tlb_finish_mmu(tlb, address, end); + spin_unlock(&mm->page_table_lock); +} + +/* + * Do a quick page-table lookup for a single page. + * mm->page_table_lock must be held. + */ +struct page * +follow_page(struct mm_struct *mm, unsigned long address, int write) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *ptep, pte; + unsigned long pfn; + struct vm_area_struct *vma; + + vma = hugepage_vma(mm, address); + if (vma) + return follow_huge_addr(mm, vma, address, write); + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || pgd_bad(*pgd)) + goto out; + + pmd = pmd_offset(pgd, address); + if (pmd_none(*pmd)) + goto out; + if (pmd_huge(*pmd)) + return follow_huge_pmd(mm, address, pmd, write); + if (pmd_bad(*pmd)) + goto out; + + ptep = pte_offset_map(pmd, address); + if (!ptep) + goto out; + + pte = *ptep; + pte_unmap(ptep); + if (pte_present(pte)) { + if (!write || (pte_write(pte) && pte_dirty(pte))) { + pfn = pte_pfn(pte); + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + + mark_page_accessed(page); + return page; + } + } + } + +out: + return NULL; +} + +/* + * Given a physical address, is there a useful struct page pointing to + * it? This may become more complex in the future if we start dealing + * with IO-aperture pages for direct-IO. + */ + +static inline struct page *get_page_map(struct page *page) +{ + if (!pfn_valid(page_to_pfn(page))) + return 0; + return page; +} + + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int i; + unsigned int flags; + + /* + * Require read or write permissions. + * If 'force' is set, we only require the "MAY" flags. + */ + flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + i = 0; + + do { + struct vm_area_struct * vma; + + vma = find_extend_vma(mm, start); + +#ifdef FIXADDR_USER_START + if (!vma && + start >= FIXADDR_USER_START && start < FIXADDR_USER_END) { + static struct vm_area_struct fixmap_vma = { + /* Catch users - if there are any valid + ones, we can make this be "&init_mm" or + something. */ + .vm_mm = NULL, + .vm_start = FIXADDR_USER_START, + .vm_end = FIXADDR_USER_END, + .vm_page_prot = PAGE_READONLY, + .vm_flags = VM_READ | VM_EXEC, + }; + unsigned long pg = start & PAGE_MASK; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + if (write) /* user fixmap pages are read-only */ + return i ? : -EFAULT; + pgd = pgd_offset_k(pg); + if (!pgd) + return i ? : -EFAULT; + pmd = pmd_offset(pgd, pg); + if (!pmd) + return i ? : -EFAULT; + pte = pte_offset_kernel(pmd, pg); + if (!pte || !pte_present(*pte)) + return i ? : -EFAULT; + if (pages) { + pages[i] = pte_page(*pte); + get_page(pages[i]); + } + if (vmas) + vmas[i] = &fixmap_vma; + i++; + start += PAGE_SIZE; + len--; + continue; + } +#endif + + if (!vma || (pages && (vma->vm_flags & VM_IO)) + || !(flags & vma->vm_flags)) + return i ? : -EFAULT; + + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &len, i); + continue; + } + spin_lock(&mm->page_table_lock); + do { + struct page *map; + while (!(map = follow_page(mm, start, write))) { + spin_unlock(&mm->page_table_lock); + switch (handle_mm_fault(mm,vma,start,write)) { + case VM_FAULT_MINOR: + tsk->min_flt++; + break; + case VM_FAULT_MAJOR: + tsk->maj_flt++; + break; + case VM_FAULT_SIGBUS: + return i ? 
i : -EFAULT; + case VM_FAULT_OOM: + return i ? i : -ENOMEM; + default: + BUG(); + } + spin_lock(&mm->page_table_lock); + } + if (pages) { + pages[i] = get_page_map(map); + if (!pages[i]) { + spin_unlock(&mm->page_table_lock); + while (i--) + page_cache_release(pages[i]); + i = -EFAULT; + goto out; + } + flush_dcache_page(pages[i]); + if (!PageReserved(pages[i])) + page_cache_get(pages[i]); + } + if (vmas) + vmas[i] = vma; + i++; + start += PAGE_SIZE; + len--; + } while(len && start < vma->vm_end); + spin_unlock(&mm->page_table_lock); + } while(len); +out: + return i; +} + +EXPORT_SYMBOL(get_user_pages); + +static void zeromap_pte_range(pte_t * pte, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long end; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); + BUG_ON(!pte_none(*pte)); + set_pte(pte, zero_pte); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + +static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, + unsigned long size, pgprot_t prot) +{ + unsigned long base, end; + + base = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + pte_t * pte = pte_alloc_map(mm, pmd, base + address); + if (!pte) + return -ENOMEM; + zeromap_pte_range(pte, base + address, end - address, prot); + pte_unmap(pte); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +int zeromap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = address; + unsigned long end = address + size; + struct mm_struct *mm = vma->vm_mm; + + dir = pgd_offset(mm, address); + flush_cache_range(vma, beg, end); + if (address >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, address); + error = -ENOMEM; + if (!pmd) + break; + error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + if (error) + break; + address = (address + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (address && (address < end)); + flush_tlb_range(vma, beg, end); + spin_unlock(&mm->page_table_lock); + return error; +} + +/* + * maps a range of physical memory into the requested pages. the old + * mappings are removed. 
any references to nonexistent pages results + * in null mappings (currently treated as "copy-on-access") + */ +static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long end; + unsigned long pfn; + + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + pfn = phys_addr >> PAGE_SHIFT; + do { + BUG_ON(!pte_none(*pte)); + if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) + set_pte(pte, pfn_pte(pfn, prot)); + address += PAGE_SIZE; + pfn++; + pte++; + } while (address && (address < end)); +} + +static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) +{ + unsigned long base, end; + + base = address & PGDIR_MASK; + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + phys_addr -= address; + do { + pte_t * pte = pte_alloc_map(mm, pmd, base + address); + if (!pte) + return -ENOMEM; + remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); + pte_unmap(pte); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); + return 0; +} + +/* Note: this is only safe if the mm semaphore is held when called. */ +int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot) +{ + int error = 0; + pgd_t * dir; + unsigned long beg = from; + unsigned long end = from + size; + struct mm_struct *mm = vma->vm_mm; + + phys_addr -= from; + dir = pgd_offset(mm, from); + flush_cache_range(vma, beg, end); + if (from >= end) + BUG(); + + spin_lock(&mm->page_table_lock); + do { + pmd_t *pmd = pmd_alloc(mm, dir, from); + error = -ENOMEM; + if (!pmd) + break; + error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + if (error) + break; + from = (from + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (from && (from < end)); + flush_tlb_range(vma, beg, end); + spin_unlock(&mm->page_table_lock); + return error; +} + +EXPORT_SYMBOL(remap_page_range); + +/* + * Establish a new mapping: + * - flush the old one + * - update the page tables + * - inform the TLB about the new one + * + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) +{ + set_pte(page_table, entry); + flush_tlb_page(vma, address); + update_mmu_cache(vma, address, entry); +} + +/* + * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock + */ +static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, + pte_t *page_table) +{ + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); +} + +/* + * This routine handles present pages, when users try to write + * to a shared page. It is done by copying the page to a new address + * and decrementing the shared-page counter for the old page. + * + * Goto-purists beware: the only reason for goto's here is that it results + * in better assembly code.. The "default" path will see no jumps at all. + * + * Note that this routine assumes that the protection checks have been + * done by the caller (the low-level page fault routine in most cases). + * Thus we can safely just mark it writable once we've done any necessary + * COW. 
+ * + * We also mark the page dirty at this point even though the page will + * change only once the write actually happens. This avoids a few races, + * and potentially makes it more efficient. + * + * We hold the mm semaphore and the page_table_lock on entry and exit + * with the page_table_lock released. + */ +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) +{ + struct page *old_page, *new_page; + unsigned long pfn = pte_pfn(pte); + struct pte_chain *pte_chain = NULL; + int ret; + + if (unlikely(!pfn_valid(pfn))) { + /* + * This should really halt the system so it can be debugged or + * at least the kernel stops what it's doing before it corrupts + * data, but for the moment just pretend this is OOM. + */ + pte_unmap(page_table); + printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", + address); + goto oom; + } + old_page = pfn_to_page(pfn); + + if (!TestSetPageLocked(old_page)) { + int reuse = can_share_swap_page(old_page); + unlock_page(old_page); + if (reuse) { + flush_cache_page(vma, address); + establish_pte(vma, address, page_table, + pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + pte_unmap(page_table); + ret = VM_FAULT_MINOR; + goto out; + } + } + pte_unmap(page_table); + + /* + * Ok, we need to copy. Oh, well.. + */ + page_cache_get(old_page); + spin_unlock(&mm->page_table_lock); + + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto no_mem; + new_page = alloc_page(GFP_HIGHUSER); + if (!new_page) + goto no_mem; + copy_cow_page(old_page,new_page,address); + + /* + * Re-check the pte - we dropped the lock + */ + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); + if (pte_same(*page_table, pte)) { + if (PageReserved(old_page)) + ++mm->rss; + page_remove_rmap(old_page, page_table); + break_cow(vma, new_page, address, page_table); + pte_chain = page_add_rmap(new_page, page_table, pte_chain); + lru_cache_add_active(new_page); + + /* Free the old page.. */ + new_page = old_page; + } + pte_unmap(page_table); + page_cache_release(new_page); + page_cache_release(old_page); + ret = VM_FAULT_MINOR; + goto out; + +no_mem: + page_cache_release(old_page); +oom: + ret = VM_FAULT_OOM; +out: + spin_unlock(&mm->page_table_lock); + pte_chain_free(pte_chain); + return ret; +} + +/* + * Helper function for invalidate_mmap_range(). + * Both hba and hlen are page numbers in PAGE_SIZE units. + * An hlen of zero blows away the entire portion file after hba. + */ +static void +invalidate_mmap_range_list(struct list_head *head, + unsigned long const hba, + unsigned long const hlen) +{ + struct list_head *curr; + unsigned long hea; /* last page of hole. */ + unsigned long vba; + unsigned long vea; /* last page of corresponding uva hole. */ + struct vm_area_struct *vp; + unsigned long zba; + unsigned long zea; + + hea = hba + hlen - 1; /* avoid overflow. */ + if (hea < hba) + hea = ULONG_MAX; + list_for_each(curr, head) { + vp = list_entry(curr, struct vm_area_struct, shared); + vba = vp->vm_pgoff; + vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1; + if (hea < vba || vea < hba) + continue; /* Mapping disjoint from hole. */ + zba = (hba <= vba) ? vba : hba; + zea = (vea <= hea) ? 
vea : hea; + zap_page_range(vp, + ((zba - vba) << PAGE_SHIFT) + vp->vm_start, + (zea - zba + 1) << PAGE_SHIFT); + } +} + +/** + * invalidate_mmap_range - invalidate the portion of all mmaps + * in the specified address_space corresponding to the specified + * page range in the underlying file. + * @address_space: the address space containing mmaps to be invalidated. + * @holebegin: byte in first page to invalidate, relative to the start of + * the underlying file. This will be rounded down to a PAGE_SIZE + * boundary. Note that this is different from vmtruncate(), which + * must keep the partial page. In contrast, we must get rid of + * partial pages. + * @holelen: size of prospective hole in bytes. This will be rounded + * up to a PAGE_SIZE boundary. A holelen of zero truncates to the + * end of the file. + */ +void invalidate_mmap_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) +{ + unsigned long hba = holebegin >> PAGE_SHIFT; + unsigned long hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + + /* Check for overflow. */ + if (sizeof(holelen) > sizeof(hlen)) { + long long holeend = + (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; + + if (holeend & ~(long long)ULONG_MAX) + hlen = ULONG_MAX - hba + 1; + } + down(&mapping->i_shared_sem); + /* Protect against page fault */ + atomic_inc(&mapping->truncate_count); + if (unlikely(!list_empty(&mapping->i_mmap))) + invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen); + if (unlikely(!list_empty(&mapping->i_mmap_shared))) + invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen); + up(&mapping->i_shared_sem); +} +EXPORT_SYMBOL_GPL(invalidate_mmap_range); + +/* + * Handle all mappings that got truncated by a "truncate()" + * system call. + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode * inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + unsigned long limit; + + if (inode->i_size < offset) + goto do_expand; + i_size_write(inode, offset); + invalidate_mmap_range(mapping, offset + PAGE_SIZE - 1, 0); + truncate_inode_pages(mapping, offset); + goto out_truncate; + +do_expand: + limit = current->rlim[RLIMIT_FSIZE].rlim_cur; + if (limit != RLIM_INFINITY && offset > limit) + goto out_sig; + if (offset > inode->i_sb->s_maxbytes) + goto out; + i_size_write(inode, offset); + +out_truncate: + if (inode->i_op && inode->i_op->truncate) + inode->i_op->truncate(inode); + return 0; +out_sig: + send_sig(SIGXFSZ, current, 0); +out: + return -EFBIG; +} + +EXPORT_SYMBOL(vmtruncate); + +/* + * Primitive swap readahead code. We simply read an aligned block of + * (1 << page_cluster) entries in the swap area. This method is chosen + * because it doesn't cost us any seek time. We also make sure to queue + * the 'original' request together with the readahead ones... + */ +void swapin_readahead(swp_entry_t entry) +{ + int i, num; + struct page *new_page; + unsigned long offset; + + /* + * Get the number of handles we should do readahead io to. 
+ */ + num = valid_swaphandles(entry, &offset); + for (i = 0; i < num; offset++, i++) { + /* Ok, do the async read-ahead now */ + new_page = read_swap_cache_async(swp_entry(swp_type(entry), + offset)); + if (!new_page) + break; + page_cache_release(new_page); + } + lru_add_drain(); /* Push any new pages onto the LRU now */ +} + +/* + * We hold the mm semaphore and the page_table_lock on entry and + * should release the pagetable lock on exit.. + */ +static int do_swap_page(struct mm_struct * mm, + struct vm_area_struct * vma, unsigned long address, + pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) +{ + struct page *page; + swp_entry_t entry = pte_to_swp_entry(orig_pte); + pte_t pte; + int ret = VM_FAULT_MINOR; + struct pte_chain *pte_chain = NULL; + + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + page = lookup_swap_cache(entry); + if (!page) { + swapin_readahead(entry); + page = read_swap_cache_async(entry); + if (!page) { + /* + * Back out if somebody else faulted in this pte while + * we released the page table lock. + */ + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); + if (pte_same(*page_table, orig_pte)) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_MINOR; + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + goto out; + } + + /* Had to read the page from swap area: Major fault */ + ret = VM_FAULT_MAJOR; + inc_page_state(pgmajfault); + } + + mark_page_accessed(page); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) { + ret = -ENOMEM; + goto out; + } + lock_page(page); + + /* + * Back out if somebody else faulted in this pte while we + * released the page table lock. + */ + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, address); + if (!pte_same(*page_table, orig_pte)) { + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + unlock_page(page); + page_cache_release(page); + ret = VM_FAULT_MINOR; + goto out; + } + + /* The page isn't present yet, go ahead with the fault. */ + + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + + mm->rss++; + pte = mk_pte(page, vma->vm_page_prot); + if (write_access && can_share_swap_page(page)) + pte = pte_mkdirty(pte_mkwrite(pte)); + unlock_page(page); + + flush_icache_page(vma, page); + set_pte(page_table, pte); + pte_chain = page_add_rmap(page, page_table, pte_chain); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); +out: + pte_chain_free(pte_chain); + return ret; +} + +/* + * We are called with the MM semaphore and page_table_lock + * spinlock held to protect against concurrent faults in + * multithreaded programs. + */ +static int +do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + pte_t *page_table, pmd_t *pmd, int write_access, + unsigned long addr) +{ + pte_t entry; + struct page * page = ZERO_PAGE(addr); + struct pte_chain *pte_chain; + int ret; + + pte_chain = pte_chain_alloc(GFP_ATOMIC); + if (!pte_chain) { + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto no_mem; + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, addr); + } + + /* Read-only mapping of ZERO_PAGE. */ + entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + + /* ..except if it's a write access */ + if (write_access) { + /* Allocate our own private page. 
*/ + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + + page = alloc_page(GFP_HIGHUSER); + if (!page) + goto no_mem; + clear_user_highpage(page, addr); + + spin_lock(&mm->page_table_lock); + page_table = pte_offset_map(pmd, addr); + + if (!pte_none(*page_table)) { + pte_unmap(page_table); + page_cache_release(page); + spin_unlock(&mm->page_table_lock); + ret = VM_FAULT_MINOR; + goto out; + } + mm->rss++; + entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + lru_cache_add_active(page); + mark_page_accessed(page); + } + + set_pte(page_table, entry); + /* ignores ZERO_PAGE */ + pte_chain = page_add_rmap(page, page_table, pte_chain); + pte_unmap(page_table); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, entry); + spin_unlock(&mm->page_table_lock); + ret = VM_FAULT_MINOR; + goto out; + +no_mem: + ret = VM_FAULT_OOM; +out: + pte_chain_free(pte_chain); + return ret; +} + +/* + * do_no_page() tries to create a new page mapping. It aggressively + * tries to share with existing pages, but makes a separate copy if + * the "write_access" parameter is true in order to avoid the next + * page fault. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. + * + * This is called with the MM semaphore held and the page table + * spinlock held. Exit with the spinlock released. + */ +static int +do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) +{ + struct page * new_page; + struct address_space *mapping = NULL; + pte_t entry; + struct pte_chain *pte_chain; + int sequence = 0; + int ret; + + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, page_table, + pmd, write_access, address); + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + + if (vma->vm_file) { + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + sequence = atomic_read(&mapping->truncate_count); + } + smp_rmb(); /* Prevent CPU from reordering lock-free ->nopage() */ +retry: + new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0); + + /* no page was available -- either SIGBUS or OOM */ + if (new_page == NOPAGE_SIGBUS) + return VM_FAULT_SIGBUS; + if (new_page == NOPAGE_OOM) + return VM_FAULT_OOM; + + pte_chain = pte_chain_alloc(GFP_KERNEL); + if (!pte_chain) + goto oom; + + /* + * Should we do an early C-O-W break? + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) { + struct page * page = alloc_page(GFP_HIGHUSER); + if (!page) { + page_cache_release(new_page); + goto oom; + } + copy_user_highpage(page, new_page, address); + page_cache_release(new_page); + lru_cache_add_active(page); + new_page = page; + } + + spin_lock(&mm->page_table_lock); + /* + * For a file-backed vma, someone could have truncated or otherwise + * invalidated this page. If invalidate_mmap_range got called, + * retry getting the page. + */ + if (mapping && + (unlikely(sequence != atomic_read(&mapping->truncate_count)))) { + sequence = atomic_read(&mapping->truncate_count); + spin_unlock(&mm->page_table_lock); + page_cache_release(new_page); + pte_chain_free(pte_chain); + goto retry; + } + page_table = pte_offset_map(pmd, address); + + /* + * This silly early PAGE_DIRTY setting removes a race + * due to the bad i386 page protection. But it's valid + * for other architectures too. 
+ * + * Note that if write_access is true, we either now have + * an exclusive copy of the page, or this is a shared mapping, + * so we can make it writable and dirty to avoid having to + * handle that later. + */ + /* Only go through if we didn't race with anybody else... */ + if (pte_none(*page_table)) { + if (!PageReserved(new_page)) + ++mm->rss; + flush_icache_page(vma, new_page); + entry = mk_pte(new_page, vma->vm_page_prot); + if (write_access) + entry = pte_mkwrite(pte_mkdirty(entry)); + set_pte(page_table, entry); + pte_chain = page_add_rmap(new_page, page_table, pte_chain); + pte_unmap(page_table); + } else { + /* One of our sibling threads was faster, back out. */ + pte_unmap(page_table); + page_cache_release(new_page); + spin_unlock(&mm->page_table_lock); + ret = VM_FAULT_MINOR; + goto out; + } + + /* no need to invalidate: a not-present page shouldn't be cached */ + update_mmu_cache(vma, address, entry); + spin_unlock(&mm->page_table_lock); + ret = VM_FAULT_MAJOR; + goto out; +oom: + ret = VM_FAULT_OOM; +out: + pte_chain_free(pte_chain); + return ret; +} + +/* + * Fault of a previously existing named mapping. Repopulate the pte + * from the encoded file_pte if possible. This enables swappable + * nonlinear vmas. + */ +static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) +{ + unsigned long pgoff; + int err; + + BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); + /* + * Fall back to the linear mapping if the fs does not support + * ->populate: + */ + if (!vma->vm_ops || !vma->vm_ops->populate || + (write_access && !(vma->vm_flags & VM_SHARED))) { + pte_clear(pte); + return do_no_page(mm, vma, address, write_access, pte, pmd); + } + + pgoff = pte_to_pgoff(*pte); + + pte_unmap(pte); + spin_unlock(&mm->page_table_lock); + + err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err) + return VM_FAULT_SIGBUS; + return VM_FAULT_MAJOR; +} + +/* + * These routines also need to handle stuff like marking pages dirty + * and/or accessed for architectures that don't do it in hardware (most + * RISC architectures). The early dirtying is also good on the i386. + * + * There is also a hook called "update_mmu_cache()" that architectures + * with external mmu caches can use to update those (ie the Sparc or + * PowerPC hashed page tables that act as extended TLBs). + * + * Note the "page_table_lock". It is to protect against kswapd removing + * pages from under us. Note that kswapd only ever _removes_ pages, never + * adds them. As such, once we have noticed that the page is not present, + * we can drop the lock early. + * + * The adding of pages is protected by the MM semaphore (which we hold), + * so we don't need to worry about a page being suddenly been added into + * our VM. + * + * We enter with the pagetable spinlock held, we are supposed to + * release it when done. + */ +static inline int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct * vma, unsigned long address, + int write_access, pte_t *pte, pmd_t *pmd) +{ + pte_t entry; + + entry = *pte; + if (!pte_present(entry)) { + /* + * If it truly wasn't present, we know that kswapd + * and the PTE updates will not touch it later. So + * drop the lock. 
+ */ + if (pte_none(entry)) + return do_no_page(mm, vma, address, write_access, pte, pmd); + if (pte_file(entry)) + return do_file_page(mm, vma, address, write_access, pte, pmd); + return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); + } + + if (write_access) { + if (!pte_write(entry)) + return do_wp_page(mm, vma, address, pte, pmd, entry); + + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + establish_pte(vma, address, pte, entry); + pte_unmap(pte); + spin_unlock(&mm->page_table_lock); + return VM_FAULT_MINOR; +} + +/* + * By the time we get here, we already hold the mm semaphore + */ +int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, + unsigned long address, int write_access) +{ + pgd_t *pgd; + pmd_t *pmd; + + __set_current_state(TASK_RUNNING); + pgd = pgd_offset(mm, address); + + inc_page_state(pgfault); + + if (is_vm_hugetlb_page(vma)) + return VM_FAULT_SIGBUS; /* mapping truncation does this. */ + + /* + * We need the page table lock to synchronize with kswapd + * and the SMP-safe atomic PTE updates. + */ + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc(mm, pgd, address); + + if (pmd) { + pte_t * pte = pte_alloc_map(mm, pmd, address); + if (pte) + return handle_pte_fault(mm, vma, address, write_access, pte, pmd); + } + spin_unlock(&mm->page_table_lock); + return VM_FAULT_OOM; +} + +/* + * Allocate page middle directory. + * + * We've already handled the fast-path in-line, and we own the + * page table lock. + * + * On a two-level page table, this ends up actually being entirely + * optimized away. + */ +pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + pmd_t *new; + + spin_unlock(&mm->page_table_lock); + new = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pgd_present(*pgd)) { + pmd_free(new); + goto out; + } + pgd_populate(mm, pgd, new); +out: + return pmd_offset(pgd, address); +} + +int make_pages_present(unsigned long addr, unsigned long end) +{ + int ret, len, write; + struct vm_area_struct * vma; + + vma = find_vma(current->mm, addr); + write = (vma->vm_flags & VM_WRITE) != 0; + if (addr >= end) + BUG(); + if (end > vma->vm_end) + BUG(); + len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; + ret = get_user_pages(current, current->mm, addr, + len, write, 0, NULL, NULL); + if (ret < 0) + return ret; + return ret == len ? 0 : -1; +} + +/* + * Map a vmalloc()-space virtual address to the physical page. + */ +struct page * vmalloc_to_page(void * vmalloc_addr) +{ + unsigned long addr = (unsigned long) vmalloc_addr; + struct page *page = NULL; + pgd_t *pgd = pgd_offset_k(addr); + pmd_t *pmd; + pte_t *ptep, pte; + + if (!pgd_none(*pgd)) { + pmd = pmd_offset(pgd, addr); + if (!pmd_none(*pmd)) { + preempt_disable(); + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + preempt_enable(); + } + } + return page; +} + +EXPORT_SYMBOL(vmalloc_to_page); diff -Nru a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c Fri Oct 31 14:10:54 2003 +++ b/mm/mmap.c Fri Oct 31 14:10:54 2003 @@ -461,11 +461,11 @@ * The caller must hold down_write(current->mm->mmap_sem). 
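The memory.c change above exists because FIXADDR_START and FIXADDR_TOP are not compile-time constants on every architecture (UML computes them at run time), so the vma's bounds are filled into the file-scope fixmap_vma by an initcall instead of by a static initializer. A minimal sketch of that late-binding pattern, with the hypothetical names my_region and runtime_limit() standing in for the real fixmap values:

#include <linux/init.h>
#include <linux/ioport.h>

/* Hypothetical stand-in for a value that is only known at boot. */
static unsigned long runtime_limit(void)
{
	return 0x100000;
}

static struct resource my_region = {
	.name = "sample",
	/* .start and .end cannot be static initializers here */
};

static int init_my_region(void)
{
	my_region.start = runtime_limit();
	my_region.end = my_region.start + 4096 - 1;
	return 0;
}

__initcall(init_my_region);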
diff -Nru a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c	Fri Oct 31 14:10:54 2003
+++ b/mm/mmap.c	Fri Oct 31 14:10:54 2003
@@ -461,11 +461,11 @@
 
  * The caller must hold down_write(current->mm->mmap_sem).
  */
-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
-			    unsigned long len, unsigned long prot,
-			    unsigned long flags, unsigned long pgoff)
+unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file * file,
+			    unsigned long addr, unsigned long len,
+			    unsigned long prot, unsigned long flags,
+			    unsigned long pgoff)
 {
-	struct mm_struct * mm = current->mm;
 	struct vm_area_struct * vma, * prev;
 	struct inode *inode;
 	unsigned int vm_flags;
*/ + if (security_vm_enough_memory(grow)) { + spin_unlock(&vma->vm_mm->page_table_lock); + return -ENOMEM; + } + + if (address - vma->vm_start > current->rlim[RLIMIT_STACK].rlim_cur || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + current->rlim[RLIMIT_AS].rlim_cur) { + spin_unlock(&vma->vm_mm->page_table_lock); + vm_unacct_memory(grow); + return -ENOMEM; + } + vma->vm_end = address; + vma->vm_mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma, *prev; + + addr &= PAGE_MASK; + vma = find_vma_prev(mm, addr, &prev); + if (vma && (vma->vm_start <= addr)) + return vma; + if (!prev || expand_stack(prev, addr)) + return NULL; + if (prev->vm_flags & VM_LOCKED) { + make_pages_present(addr, prev->vm_end); + } + return prev; +} +#else +/* + * vma is the first one with address < vma->vm_start. Have to extend vma. + */ +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + unsigned long grow; + + /* + * vma->vm_start/vm_end cannot change under us because the caller + * is required to hold the mmap_sem in read mode. We need to get + * the spinlock only before relocating the vma range ourself. + */ + address &= PAGE_MASK; + spin_lock(&vma->vm_mm->page_table_lock); + grow = (vma->vm_start - address) >> PAGE_SHIFT; + + /* Overcommit.. */ + if (security_vm_enough_memory(grow)) { + spin_unlock(&vma->vm_mm->page_table_lock); + return -ENOMEM; + } + + if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > + current->rlim[RLIMIT_AS].rlim_cur) { + spin_unlock(&vma->vm_mm->page_table_lock); + vm_unacct_memory(grow); + return -ENOMEM; + } + vma->vm_start = address; + vma->vm_pgoff -= grow; + vma->vm_mm->total_vm += grow; + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm += grow; + spin_unlock(&vma->vm_mm->page_table_lock); + return 0; +} + +struct vm_area_struct * +find_extend_vma(struct mm_struct * mm, unsigned long addr) +{ + struct vm_area_struct * vma; + unsigned long start; + + addr &= PAGE_MASK; + vma = find_vma(mm,addr); + if (!vma) + return NULL; + if (vma->vm_start <= addr) + return vma; + if (!(vma->vm_flags & VM_GROWSDOWN)) + return NULL; + start = vma->vm_start; + if (expand_stack(vma, addr)) + return NULL; + if (vma->vm_flags & VM_LOCKED) { + make_pages_present(addr, start); + } + return vma; +} +#endif + +/* + * Try to free as many page directory entries as we can, + * without having to work very hard at actually scanning + * the page tables themselves. + * + * Right now we try to free page tables if we have a nice + * PGDIR-aligned area that got free'd up. We could be more + * granular if we want to, but this is fast and simple, + * and covers the bad cases. + * + * "prev", if it exists, points to a vma before the one + * we just free'd - but there's no telling how much before. 
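+ * + * For example, on i386 without PAE (4MB per pgd slot), freeing + * [6MB, 11MB) starts out with first = 4MB and last = 15MB-1; the + * neighbouring vmas then pull those bounds in, so only pgd slots + * with no live mapping left are actually freed.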
+ */ +static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, + unsigned long start, unsigned long end) +{ + unsigned long first = start & PGDIR_MASK; + unsigned long last = end + PGDIR_SIZE - 1; + unsigned long start_index, end_index; + struct mm_struct *mm = tlb->mm; + + if (!prev) { + prev = mm->mmap; + if (!prev) + goto no_mmaps; + if (prev->vm_end > start) { + if (last > prev->vm_start) + last = prev->vm_start; + goto no_mmaps; + } + } + for (;;) { + struct vm_area_struct *next = prev->vm_next; + + if (next) { + if (next->vm_start < start) { + prev = next; + continue; + } + if (last > next->vm_start) + last = next->vm_start; + } + if (prev->vm_end > first) + first = prev->vm_end + PGDIR_SIZE - 1; + break; + } +no_mmaps: + if (last < first) /* for arches with discontiguous pgd indices */ + return; + /* + * If the PGD bits are not consecutive in the virtual address, the + * old method of shifting the VA >> by PGDIR_SHIFT doesn't work. + */ + start_index = pgd_index(first); + if (start_index < FIRST_USER_PGD_NR) + start_index = FIRST_USER_PGD_NR; + end_index = pgd_index(last); + if (end_index > start_index) { + clear_page_tables(tlb, start_index, end_index - start_index); + flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK); + } +} + +/* Normal function to fix up a mapping + * This function is the default for when an area has no specific + * function. This may be used as part of a more specific routine. + * + * By the time this function is called, the area struct has been + * removed from the process mapping list. + */ +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) +{ + size_t len = area->vm_end - area->vm_start; + + area->vm_mm->total_vm -= len >> PAGE_SHIFT; + if (area->vm_flags & VM_LOCKED) + area->vm_mm->locked_vm -= len >> PAGE_SHIFT; + /* + * Is this a new hole at the lowest possible address? + */ + if (area->vm_start >= TASK_UNMAPPED_BASE && + area->vm_start < area->vm_mm->free_area_cache) + area->vm_mm->free_area_cache = area->vm_start; + + remove_shared_vm_struct(area); + + if (area->vm_ops && area->vm_ops->close) + area->vm_ops->close(area); + if (area->vm_file) + fput(area->vm_file); + kmem_cache_free(vm_area_cachep, area); +} + +/* + * Update the VMA and inode share lists. + * + * Ok - we have the memory areas we should free on the 'free' list, + * so release them, and do the vma updates. + */ +static void unmap_vma_list(struct mm_struct *mm, + struct vm_area_struct *mpnt) +{ + do { + struct vm_area_struct *next = mpnt->vm_next; + unmap_vma(mm, mpnt); + mpnt = next; + } while (mpnt != NULL); + validate_mm(mm); +} + +/* + * Get rid of page table information in the indicated region. + * + * Called with the page table lock held. + */ +static void unmap_region(struct mm_struct *mm, + struct vm_area_struct *vma, + struct vm_area_struct *prev, + unsigned long start, + unsigned long end) +{ + struct mmu_gather *tlb; + unsigned long nr_accounted = 0; + + lru_add_drain(); + tlb = tlb_gather_mmu(mm, 0); + unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted); + vm_unacct_memory(nr_accounted); + free_pgtables(tlb, prev, start, end); + tlb_finish_mmu(tlb, start, end); +} + +/* + * Create a list of vma's touched by the unmap, removing them from the mm's + * vma list as we go.. + * + * Called with the page_table_lock held. 
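+ * + * E.g. unmapping [B, C) from the chain A -> B -> C relinks the mm's + * list as A -> C, while the detached run (B up to tail_vma, now + * NULL-terminated) stays reachable through the caller's pointer for + * unmap_region() and unmap_vma_list().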
+ */ +static void +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, + struct vm_area_struct *prev, unsigned long end) +{ + struct vm_area_struct **insertion_point; + struct vm_area_struct *tail_vma = NULL; + + insertion_point = (prev ? &prev->vm_next : &mm->mmap); + do { + rb_erase(&vma->vm_rb, &mm->mm_rb); + mm->map_count--; + tail_vma = vma; + vma = vma->vm_next; + } while (vma && vma->vm_start < end); + *insertion_point = vma; + tail_vma->vm_next = NULL; + mm->mmap_cache = NULL; /* Kill the cache. */ +} + +/* + * Split a vma into two pieces at address 'addr'; a new vma is allocated + * either for the first part or the tail. + */ +int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, + unsigned long addr, int new_below) +{ + struct vm_area_struct *new; + + if (mm->map_count >= MAX_MAP_COUNT) + return -ENOMEM; + + new = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!new) + return -ENOMEM; + + /* most fields are the same, copy all, and then fixup */ + *new = *vma; + + INIT_LIST_HEAD(&new->shared); + + if (new_below) { + new->vm_end = addr; + vma->vm_start = addr; + vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + } else { + vma->vm_end = addr; + new->vm_start = addr; + new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); + } + + if (new->vm_file) + get_file(new->vm_file); + + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + + insert_vm_struct(mm, new); + return 0; +} + +/* Munmap is split into 2 main parts -- this part which finds + * what needs doing, and the areas themselves, which do the + * work. This now handles partial unmappings. + * Jeremy Fitzhardinge <jeremy@goop.org> + */ +int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + unsigned long end; + struct vm_area_struct *mpnt, *prev, *last; + + if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + + if ((len = PAGE_ALIGN(len)) == 0) + return -EINVAL; + + /* Find the first overlapping VMA */ + mpnt = find_vma_prev(mm, start, &prev); + if (!mpnt) + return 0; + /* we have start < mpnt->vm_end */ + + if (is_vm_hugetlb_page(mpnt)) { + int ret = is_aligned_hugepage_range(start, len); + + if (ret) + return ret; + } + + /* if it doesn't overlap, we have nothing.. */ + end = start + len; + if (mpnt->vm_start >= end) + return 0; + + /* Something will probably happen, so notify. */ + if (mpnt->vm_file && (mpnt->vm_flags & VM_EXEC)) + profile_exec_unmap(mm); + + /* + * If we need to split any vma, do it now to save pain later. + * + * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially + * unmapped vm_area_struct will remain in use: so lower split_vma + * places tmp vma above, and higher split_vma places tmp vma below. + */ + if (start > mpnt->vm_start) { + if (split_vma(mm, mpnt, start, 0)) + return -ENOMEM; + prev = mpnt; + } + + /* Does it split the last one? */ + last = find_vma(mm, end); + if (last && end > last->vm_start) { + if (split_vma(mm, last, end, 1)) + return -ENOMEM; + } + mpnt = prev?
prev->vm_next: mm->mmap; + + /* + * Remove the vma's, and unmap the actual pages + */ + spin_lock(&mm->page_table_lock); + detach_vmas_to_be_unmapped(mm, mpnt, prev, end); + unmap_region(mm, mpnt, prev, start, end); + spin_unlock(&mm->page_table_lock); + + /* Fix up all other VM information */ + unmap_vma_list(mm, mpnt); + + return 0; +} + +EXPORT_SYMBOL(do_munmap); + +asmlinkage long sys_munmap(unsigned long addr, size_t len) +{ + int ret; + struct mm_struct *mm = current->mm; + + down_write(&mm->mmap_sem); + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; +} + +/* + * this is really a simplified "do_mmap". it only handles + * anonymous maps. eventually we may be able to do some + * brk-specific accounting here. + */ +unsigned long do_brk(unsigned long addr, unsigned long len) +{ + struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + unsigned long flags; + struct rb_node ** rb_link, * rb_parent; + + len = PAGE_ALIGN(len); + if (!len) + return addr; + + if ((addr + len) > TASK_SIZE || (addr + len) < addr) + return -EINVAL; + + /* + * mlock MCL_FUTURE? + */ + if (mm->def_flags & VM_LOCKED) { + unsigned long locked = mm->locked_vm << PAGE_SHIFT; + locked += len; + if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur) + return -EAGAIN; + } + + /* + * Clear old maps. this also does some error checking for us + */ + munmap_back: + vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); + if (vma && vma->vm_start < addr + len) { + if (do_munmap(mm, addr, len)) + return -ENOMEM; + goto munmap_back; + } + + /* Check against address space limits *after* clearing old maps... */ + if ((mm->total_vm << PAGE_SHIFT) + len + > current->rlim[RLIMIT_AS].rlim_cur) + return -ENOMEM; + + if (mm->map_count > MAX_MAP_COUNT) + return -ENOMEM; + + if (security_vm_enough_memory(len >> PAGE_SHIFT)) + return -ENOMEM; + + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + + /* Can we just expand an old anonymous mapping? */ + if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, + flags, NULL, 0)) + goto out; + + /* + * create a vma struct for an anonymous mapping + */ + vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + if (!vma) { + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; + } + + vma->vm_mm = mm; + vma->vm_start = addr; + vma->vm_end = addr + len; + vma->vm_flags = flags; + vma->vm_page_prot = protection_map[flags & 0x0f]; + vma->vm_ops = NULL; + vma->vm_pgoff = 0; + vma->vm_file = NULL; + vma->vm_private_data = NULL; + INIT_LIST_HEAD(&vma->shared); + + vma_link(mm, vma, prev, rb_link, rb_parent); + +out: + mm->total_vm += len >> PAGE_SHIFT; + if (flags & VM_LOCKED) { + mm->locked_vm += len >> PAGE_SHIFT; + make_pages_present(addr, addr + len); + } + return addr; +} + +EXPORT_SYMBOL(do_brk); + +/* Build the RB tree corresponding to the VMA list. */ +void build_mmap_rb(struct mm_struct * mm) +{ + struct vm_area_struct * vma; + struct rb_node ** rb_link, * rb_parent; + + mm->mm_rb = RB_ROOT; + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + __vma_link_rb(mm, vma, rb_link, rb_parent); + rb_parent = &vma->vm_rb; + rb_link = &rb_parent->rb_right; + } +} + +/* Release all mmaps. 
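+ * Normally reached via mmput() when the last user of the address space + * goes away.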
*/ +void exit_mmap(struct mm_struct *mm) +{ + struct mmu_gather *tlb; + struct vm_area_struct *vma; + unsigned long nr_accounted = 0; + + profile_exit_mmap(mm); + + lru_add_drain(); + + spin_lock(&mm->page_table_lock); + + tlb = tlb_gather_mmu(mm, 1); + flush_cache_mm(mm); + /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ + mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, + ~0UL, &nr_accounted); + vm_unacct_memory(nr_accounted); + BUG_ON(mm->map_count); /* This is just debugging */ + clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD); + tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); + + vma = mm->mmap; + mm->mmap = mm->mmap_cache = NULL; + mm->mm_rb = RB_ROOT; + mm->rss = 0; + mm->total_vm = 0; + mm->locked_vm = 0; + + spin_unlock(&mm->page_table_lock); + + /* + * Walk the list again, actually closing and freeing it + * without holding any MM locks. + */ + while (vma) { + struct vm_area_struct *next = vma->vm_next; + remove_shared_vm_struct(vma); + if (vma->vm_ops) { + if (vma->vm_ops->close) + vma->vm_ops->close(vma); + } + if (vma->vm_file) + fput(vma->vm_file); + kmem_cache_free(vm_area_cachep, vma); + vma = next; + } +} + +/* Insert vm structure into process list sorted by address + * and into the inode's i_mmap ring. If vm_file is non-NULL + * then i_shared_sem is taken here. + */ +void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + struct rb_node ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + vma_link(mm, vma, prev, rb_link, rb_parent); + validate_mm(mm); +} diff -Nru a/mm/mprotect.c b/mm/mprotect.c --- a/mm/mprotect.c Fri Oct 31 14:10:53 2003 +++ b/mm/mprotect.c Fri Oct 31 14:10:53 2003 @@ -222,7 +222,8 @@ } asmlinkage long -sys_mprotect(unsigned long start, size_t len, unsigned long prot) +do_mprotect(struct mm_struct *mm, unsigned long start, size_t len, + unsigned long prot) { unsigned long vm_flags, nstart, end, tmp; struct vm_area_struct * vma, * next, * prev; @@ -326,6 +327,11 @@ prev->vm_mm->map_count--; } out: - up_write(&current->mm->mmap_sem); + up_write(&mm->mmap_sem); return error; +} + +asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot) +{ + return(do_mprotect(current->mm, start, len, prot)); } diff -Nru a/mm/mprotect.c~do_mmap2-fix.diff b/mm/mprotect.c~do_mmap2-fix.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/mprotect.c~do_mmap2-fix.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,331 @@ +/* + * mm/mprotect.c + * + * (C) Copyright 1994 Linus Torvalds + * (C) Copyright 2002 Christoph Hellwig + * + * Address space accounting code <alan@redhat.com> + * (C) Copyright 2002 Red Hat Inc, All Rights Reserved + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <linux/slab.h> +#include <linux/shm.h> +#include <linux/mman.h> +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/security.h> + +#include <asm/uaccess.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +static inline void +change_pte_range(pmd_t *pmd, unsigned long address, + unsigned long size, pgprot_t newprot) +{ + pte_t * pte; + unsigned long end; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return; + } + pte = pte_offset_map(pmd, address); + address &= ~PMD_MASK; + end = address + size; + if (end > PMD_SIZE) + end = PMD_SIZE; + do { + if
(pte_present(*pte)) { + pte_t entry; + + /* Avoid an SMP race with hardware updated dirty/clean + * bits by wiping the pte and then setting the new pte + * into place. + */ + entry = ptep_get_and_clear(pte); + set_pte(pte, pte_modify(entry, newprot)); + } + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); + pte_unmap(pte - 1); +} + +static inline void +change_pmd_range(pgd_t *pgd, unsigned long address, + unsigned long size, pgprot_t newprot) +{ + pmd_t * pmd; + unsigned long end; + + if (pgd_none(*pgd)) + return; + if (pgd_bad(*pgd)) { + pgd_ERROR(*pgd); + pgd_clear(pgd); + return; + } + pmd = pmd_offset(pgd, address); + address &= ~PGDIR_MASK; + end = address + size; + if (end > PGDIR_SIZE) + end = PGDIR_SIZE; + do { + change_pte_range(pmd, address, end - address, newprot); + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); +} + +static void +change_protection(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot) +{ + pgd_t *dir; + unsigned long beg = start; + + dir = pgd_offset(current->mm, start); + flush_cache_range(vma, beg, end); + if (start >= end) + BUG(); + spin_lock(&current->mm->page_table_lock); + do { + change_pmd_range(dir, start, end - start, newprot); + start = (start + PGDIR_SIZE) & PGDIR_MASK; + dir++; + } while (start && (start < end)); + flush_tlb_range(vma, beg, end); + spin_unlock(&current->mm->page_table_lock); + return; +} +/* + * Try to merge a vma with the previous one, return 1 if successful or 0 if it + * was impossible. + */ +static int +mprotect_attempt_merge(struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long end, int newflags) +{ + struct mm_struct * mm = vma->vm_mm; + + if (!prev || !vma) + return 0; + if (prev->vm_end != vma->vm_start) + return 0; + if (!can_vma_merge(prev, newflags)) + return 0; + if (vma->vm_file || (vma->vm_flags & VM_SHARED)) + return 0; + + /* + * If the whole area changes to the protection of the previous one + * we can just get rid of it. + */ + if (end == vma->vm_end) { + spin_lock(&mm->page_table_lock); + prev->vm_end = end; + __vma_unlink(mm, vma, prev); + spin_unlock(&mm->page_table_lock); + + kmem_cache_free(vm_area_cachep, vma); + mm->map_count--; + return 1; + } + + /* + * Otherwise extend it. + */ + spin_lock(&mm->page_table_lock); + prev->vm_end = end; + vma->vm_start = end; + spin_unlock(&mm->page_table_lock); + return 1; +} + +static int +mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, + unsigned long start, unsigned long end, unsigned int newflags) +{ + struct mm_struct * mm = vma->vm_mm; + unsigned long charged = 0; + pgprot_t newprot; + int error; + + if (newflags == vma->vm_flags) { + *pprev = vma; + return 0; + } + + /* + * If we make a private mapping writable we increase our commit; + * but (without finer accounting) cannot reduce our commit if we + * make it unwritable again. + * + * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting + * a MAP_NORESERVE private mapping to writable will now reserve. + */ + if (newflags & VM_WRITE) { + if (!(vma->vm_flags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { + charged = (end - start) >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + return -ENOMEM; + newflags |= VM_ACCOUNT; + } + } + + newprot = protection_map[newflags & 0xf]; + + if (start == vma->vm_start) { + /* + * Try to merge with the previous vma.
+ */ + if (mprotect_attempt_merge(vma, *pprev, end, newflags)) { + vma = *pprev; + goto success; + } + } else { + error = split_vma(mm, vma, start, 1); + if (error) + goto fail; + } + /* + * Unless it returns an error, this function always sets *pprev to + * the first vma for which vma->vm_end >= end. + */ + *pprev = vma; + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto fail; + } + + spin_lock(&mm->page_table_lock); + vma->vm_flags = newflags; + vma->vm_page_prot = newprot; + spin_unlock(&mm->page_table_lock); +success: + change_protection(vma, start, end, newprot); + return 0; + +fail: + vm_unacct_memory(charged); + return error; +} + +asmlinkage long +sys_mprotect(unsigned long start, size_t len, unsigned long prot) +{ + unsigned long vm_flags, nstart, end, tmp; + struct vm_area_struct * vma, * next, * prev; + int error = -EINVAL; + const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); + if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ + return -EINVAL; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = PAGE_ALIGN(len); + end = start + len; + if (end < start) + return -EINVAL; + if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) + return -EINVAL; + if (end == start) + return 0; + + vm_flags = calc_vm_prot_bits(prot); + + down_write(&current->mm->mmap_sem); + + vma = find_vma_prev(current->mm, start, &prev); + error = -ENOMEM; + if (!vma) + goto out; + if (unlikely(grows & PROT_GROWSDOWN)) { + if (vma->vm_start >= end) + goto out; + start = vma->vm_start; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out; + } + else { + if (vma->vm_start > start) + goto out; + if (unlikely(grows & PROT_GROWSUP)) { + end = vma->vm_end; + error = -EINVAL; + if (!(vma->vm_flags & VM_GROWSUP)) + goto out; + } + } + + for (nstart = start ; ; ) { + unsigned int newflags; + int last = 0; + + /* Here we know that vma->vm_start <= nstart < vma->vm_end.
*/ + + if (is_vm_hugetlb_page(vma)) { + error = -EACCES; + goto out; + } + + newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); + + if ((newflags & ~(newflags >> 4)) & 0xf) { + error = -EACCES; + goto out; + } + + error = security_file_mprotect(vma, prot); + if (error) + goto out; + + if (vma->vm_end > end) { + error = mprotect_fixup(vma, &prev, nstart, end, newflags); + goto out; + } + if (vma->vm_end == end) + last = 1; + + tmp = vma->vm_end; + next = vma->vm_next; + error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); + if (error) + goto out; + if (last) + break; + nstart = tmp; + vma = next; + if (!vma || vma->vm_start != nstart) { + error = -ENOMEM; + goto out; + } + } + + if (next && prev->vm_end == next->vm_start && + can_vma_merge(next, prev->vm_flags) && + !prev->vm_file && !(prev->vm_flags & VM_SHARED)) { + spin_lock(&prev->vm_mm->page_table_lock); + prev->vm_end = next->vm_end; + __vma_unlink(prev->vm_mm, next, prev); + spin_unlock(&prev->vm_mm->page_table_lock); + + kmem_cache_free(vm_area_cachep, next); + prev->vm_mm->map_count--; + } +out: + up_write(&current->mm->mmap_sem); + return error; +} diff -Nru a/mm/readahead.c b/mm/readahead.c --- a/mm/readahead.c Fri Oct 31 14:10:53 2003 +++ b/mm/readahead.c Fri Oct 31 14:10:53 2003 @@ -464,6 +464,7 @@ out: return; } +EXPORT_SYMBOL(page_cache_readahead); /* diff -Nru a/mm/readahead.c~export-page_cache_readahead.diff b/mm/readahead.c~export-page_cache_readahead.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/readahead.c~export-page_cache_readahead.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,524 @@ +/* + * mm/readahead.c - address_space-level file readahead. + * + * Copyright (C) 2002, Linus Torvalds + * + * 09Apr2002 akpm@zip.com.au + * Initial version. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/pagevec.h> + +struct backing_dev_info default_backing_dev_info = { + .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, + .state = 0, +}; + +EXPORT_SYMBOL_GPL(default_backing_dev_info); + +/* + * Initialise a struct file's readahead state + */ +void +file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) +{ + memset(ra, 0, sizeof(*ra)); + ra->ra_pages = mapping->backing_dev_info->ra_pages; +} + +EXPORT_SYMBOL(file_ra_state_init); + +/* + * Return max readahead size for this inode in number-of-pages. + */ +static inline unsigned long get_max_readahead(struct file_ra_state *ra) +{ + return ra->ra_pages; +} + +static inline unsigned long get_min_readahead(struct file_ra_state *ra) +{ + return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; +} + +#define list_to_page(head) (list_entry((head)->prev, struct page, list)) + +/** + * read_cache_pages - populate an address space with some pages, and + * start reads against them. + * @mapping: the address_space + * @pages: The address of a list_head which contains the target pages. These + * pages have their ->index populated and are otherwise uninitialised. + * @filler: callback routine for filling a single page. + * @data: private data for the callback routine. + * + * Hides the details of the LRU cache etc from the filesystems.
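+ * + * A typical caller builds a list of pages with ->index set and passes + * a readpage-style filler, along the lines of (sketch only, names + * made up): + * + * err = read_cache_pages(mapping, &page_list, my_filler, my_data); + * + * If the filler ever fails, the remaining pages on the list are + * simply released and the error is returned.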
+ */ +int read_cache_pages(struct address_space *mapping, struct list_head *pages, + int (*filler)(void *, struct page *), void *data) +{ + struct page *page; + struct pagevec lru_pvec; + int ret = 0; + + pagevec_init(&lru_pvec, 0); + + while (!list_empty(pages)) { + page = list_to_page(pages); + list_del(&page->list); + if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { + page_cache_release(page); + continue; + } + ret = filler(data, page); + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + if (ret) { + while (!list_empty(pages)) { + struct page *victim; + + victim = list_to_page(pages); + list_del(&victim->list); + page_cache_release(victim); + } + break; + } + } + pagevec_lru_add(&lru_pvec); + return ret; +} + +EXPORT_SYMBOL(read_cache_pages); + +static int read_pages(struct address_space *mapping, struct file *filp, + struct list_head *pages, unsigned nr_pages) +{ + unsigned page_idx; + struct pagevec lru_pvec; + int ret = 0; + + if (mapping->a_ops->readpages) { + ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); + goto out; + } + + pagevec_init(&lru_pvec, 0); + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_to_page(pages); + list_del(&page->list); + if (!add_to_page_cache(page, mapping, + page->index, GFP_KERNEL)) { + mapping->a_ops->readpage(filp, page); + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + } else { + page_cache_release(page); + } + } + pagevec_lru_add(&lru_pvec); +out: + return ret; +} + +/* + * Readahead design. + * + * The fields in struct file_ra_state represent the most-recently-executed + * readahead attempt: + * + * start: Page index at which we started the readahead + * size: Number of pages in that read + * Together, these form the "current window". + * Together, start and size represent the `readahead window'. + * next_size: The number of pages to read on the next readahead miss. + * Has the magical value -1UL if readahead has been disabled. + * prev_page: The page which the readahead algorithm most-recently inspected. + * prev_page is mainly an optimisation: if page_cache_readahead + * sees that it is again being called for a page which it just + * looked at, it can return immediately without making any state + * changes. + * ahead_start, + * ahead_size: Together, these form the "ahead window". + * ra_pages: The externally controlled max readahead for this fd. + * + * When readahead is in the "maximally shrunk" state (next_size == -1UL), + * readahead is disabled. In this state, prev_page and size are used, inside + * handle_ra_miss(), to detect the resumption of sequential I/O. Once there + * has been a decent run of sequential I/O (defined by get_min_readahead), + * readahead is reenabled. + * + * The readahead code manages two windows - the "current" and the "ahead" + * windows. The intent is that while the application is walking the pages + * in the current window, I/O is underway on the ahead window. When the + * current window is fully traversed, it is replaced by the ahead window + * and the ahead window is invalidated. When this copying happens, the + * new current window's pages are probably still locked. When I/O has + * completed, we submit a new batch of I/O, creating a new ahead window. + * + * So: + * + * ----|----------------|----------------|----- + * ^start ^start+size + * ^ahead_start ^ahead_start+ahead_size + * + * ^ When this page is read, we submit I/O for the + * ahead window. 
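+ * + * For instance, with start = 100, size = 16, ahead_start = 116 and + * ahead_size = 16, a read of page 116 lands on the first page of the + * ahead window: the ahead window is promoted to be the current window + * (start = 116, size = 16) and a fresh ahead window is submitted at + * page 132 once the next in-window page is requested.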
+ * + * A `readahead hit' occurs when a read request is made against a page which is + * inside the current window. Hits are good, and the window size (next_size) + * is grown aggressively when hits occur. Two pages are added to the next + * window size on each hit, which will end up doubling the next window size by + * the time I/O is submitted for it. + * + * If readahead hits are more sparse (say, the application is only reading + * every second page) then the window will build more slowly. + * + * On a readahead miss (the application seeked away) the readahead window is + * shrunk by 25%. We don't want to drop it too aggressively, because it is a + * good assumption that an application which has built a good readahead window + * will continue to perform linear reads. Either at the new file position, or + * at the old one after another seek. + * + * After enough misses, readahead is fully disabled. (next_size = -1UL). + * + * There is a special-case: if the first page which the application tries to + * read happens to be the first page of the file, it is assumed that a linear + * read is about to happen and the window is immediately set to half of the + * device maximum. + * + * A page request at (start + size) is not a miss at all - it's just a part of + * sequential file reading. + * + * This function is to be called for every page which is read, rather than when + * it is time to perform readahead. This is so the readahead algorithm can + * centrally work out the access patterns. This could be costly with many tiny + * read()s, so we specifically optimise for that case with prev_page. + */ + +/* + * do_page_cache_readahead actually reads a chunk of disk. It allocates all + * the pages first, then submits them all for I/O. This avoids the very bad + * behaviour which would occur if page allocations are causing VM writeback. + * We really don't want to intermingle reads and writes like that. + * + * Returns the number of pages which actually had IO started against them. + */ +static inline int +__do_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read) +{ + struct inode *inode = mapping->host; + struct page *page; + unsigned long end_index; /* The last page we want to read */ + LIST_HEAD(page_pool); + int page_idx; + int ret = 0; + loff_t isize = i_size_read(inode); + + if (isize == 0) + goto out; + + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + + /* + * Preallocate as many pages as we will need. + */ + spin_lock(&mapping->page_lock); + for (page_idx = 0; page_idx < nr_to_read; page_idx++) { + unsigned long page_offset = offset + page_idx; + + if (page_offset > end_index) + break; + + page = radix_tree_lookup(&mapping->page_tree, page_offset); + if (page) + continue; + + spin_unlock(&mapping->page_lock); + page = page_cache_alloc_cold(mapping); + spin_lock(&mapping->page_lock); + if (!page) + break; + page->index = page_offset; + list_add(&page->list, &page_pool); + ret++; + } + spin_unlock(&mapping->page_lock); + + /* + * Now start the IO. We ignore I/O errors - if the page is not + * uptodate then the caller will launch readpage again, and + * will then handle the error. + */ + if (ret) + read_pages(mapping, filp, &page_pool, ret); + BUG_ON(!list_empty(&page_pool)); +out: + return ret; +} + +/* + * Chunk the readahead into 2 megabyte units, so that we don't pin too much + * memory at once. 
+ */ +int force_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read) +{ + int ret = 0; + + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) + return -EINVAL; + + while (nr_to_read) { + int err; + + unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE; + + if (this_chunk > nr_to_read) + this_chunk = nr_to_read; + err = __do_page_cache_readahead(mapping, filp, + offset, this_chunk); + if (err < 0) { + ret = err; + break; + } + ret += err; + offset += this_chunk; + nr_to_read -= this_chunk; + } + return ret; +} + +/* + * This version skips the IO if the queue is read-congested, and will tell the + * block layer to abandon the readahead if request allocation would block. + * + * force_page_cache_readahead() will ignore queue congestion and will block on + * request queues. + */ +int do_page_cache_readahead(struct address_space *mapping, struct file *filp, + unsigned long offset, unsigned long nr_to_read) +{ + if (!bdi_read_congested(mapping->backing_dev_info)) + return __do_page_cache_readahead(mapping, filp, + offset, nr_to_read); + return 0; +} + +/* + * Check how effective readahead is being. If the amount of started IO is + * less than expected then the file is partly or fully in pagecache and + * readahead isn't helping. Shrink the window. + * + * But don't shrink it too much - the application may read the same page + * occasionally. + */ +static inline void +check_ra_success(struct file_ra_state *ra, pgoff_t attempt, + pgoff_t actual, pgoff_t orig_next_size) +{ + if (actual == 0) { + if (orig_next_size > 1) { + ra->next_size = orig_next_size - 1; + if (ra->ahead_size) + ra->ahead_size = ra->next_size; + } else { + ra->next_size = -1UL; + ra->size = 0; + } + } +} + +/* + * page_cache_readahead is the main function. It performs the adaptive + * readahead window size management and submits the readahead I/O. + */ +void +page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, + struct file *filp, unsigned long offset) +{ + unsigned max; + unsigned min; + unsigned orig_next_size; + unsigned actual; + + /* + * Here we detect the case where the application is performing + * sub-page sized reads. We avoid doing extra work and bogusly + * perturbing the readahead window expansion logic. + * If next_size is zero, this is the very first read for this + * file handle, or the window is maximally shrunk. + */ + if (offset == ra->prev_page) { + if (ra->next_size != 0) + goto out; + } + + if (ra->next_size == -1UL) + goto out; /* Maximally shrunk */ + + max = get_max_readahead(ra); + if (max == 0) + goto out; /* No readahead */ + + min = get_min_readahead(ra); + orig_next_size = ra->next_size; + + if (ra->next_size == 0 && offset == 0) { + /* + * Special case - first read from first page. + * We'll assume it's a whole-file read, and + * grow the window fast. + */ + ra->next_size = max / 2; + goto do_io; + } + + ra->prev_page = offset; + + if (offset >= ra->start && offset <= (ra->start + ra->size)) { + /* + * A readahead hit. Either inside the window, or one + * page beyond the end. Expand the next readahead size. + */ + ra->next_size += 2; + } else { + /* + * A miss - lseek, pagefault, pread, etc. Shrink the readahead + * window.
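+ * A miss undoes a hit, and enough misses in a row will drive + * next_size to zero and turn readahead off.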
+ */ + ra->next_size -= 2; + } + + if ((long)ra->next_size > (long)max) + ra->next_size = max; + if ((long)ra->next_size <= 0L) { + ra->next_size = -1UL; + ra->size = 0; + goto out; /* Readahead is off */ + } + + /* + * Is this request outside the current window? + */ + if (offset < ra->start || offset >= (ra->start + ra->size)) { + /* + * A miss against the current window. Have we merely + * advanced into the ahead window? + */ + if (offset == ra->ahead_start) { + /* + * Yes, we have. The ahead window now becomes + * the current window. + */ + ra->start = ra->ahead_start; + ra->size = ra->ahead_size; + ra->prev_page = ra->start; + ra->ahead_start = 0; + ra->ahead_size = 0; + + /* + * Control now returns, probably to sleep until I/O + * completes against the first ahead page. + * When the second page in the old ahead window is + * requested, control will return here and more I/O + * will be submitted to build the new ahead window. + */ + goto out; + } +do_io: + /* + * This is the "unusual" path. We come here during + * startup or after an lseek. We invalidate the + * ahead window and get some I/O underway for the new + * current window. + */ + ra->start = offset; + ra->size = ra->next_size; + ra->ahead_start = 0; /* Invalidate these */ + ra->ahead_size = 0; + actual = do_page_cache_readahead(mapping, filp, offset, + ra->size); + check_ra_success(ra, ra->size, actual, orig_next_size); + } else { + /* + * This read request is within the current window. It is time + * to submit I/O for the ahead window while the application is + * crunching through the current window. + */ + if (ra->ahead_start == 0) { + ra->ahead_start = ra->start + ra->size; + ra->ahead_size = ra->next_size; + actual = do_page_cache_readahead(mapping, filp, + ra->ahead_start, ra->ahead_size); + check_ra_success(ra, ra->ahead_size, + actual, orig_next_size); + } + } +out: + return; +} + + +/* + * handle_ra_miss() is called when it is known that a page which should have + * been present in the pagecache (we just did some readahead there) was in fact + * not found. This will happen if it was evicted by the VM (readahead + * thrashing) or if the readahead window is maximally shrunk. + * + * If the window has been maximally shrunk (next_size == -1UL) then look to see + * if we are getting misses against sequential file offsets. If so, and this + * persists then resume readahead. + * + * Otherwise we're thrashing, so shrink the readahead window by three pages. + * This is because it is grown by two pages on a readahead hit. Theory being + * that the readahead window size will stabilise around the maximum level at + * which there is no thrashing. + */ +void handle_ra_miss(struct address_space *mapping, + struct file_ra_state *ra, pgoff_t offset) +{ + if (ra->next_size == -1UL) { + const unsigned long max = get_max_readahead(ra); + + if (offset != ra->prev_page + 1) { + ra->size = 0; /* Not sequential */ + } else { + ra->size++; /* A sequential read */ + if (ra->size >= max) { /* Resume readahead */ + ra->start = offset - max; + ra->next_size = max; + ra->size = max; + ra->ahead_start = 0; + ra->ahead_size = 0; + } + } + ra->prev_page = offset; + } else { + const unsigned long min = get_min_readahead(ra); + + ra->next_size -= 3; + if (ra->next_size < min) + ra->next_size = min; + } +} + +/* + * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a + * sensible upper limit. 
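+ * + * The cap is half of the currently reclaimable memory: e.g. with 1000 + * inactive and 3000 free pages, a request for 5000 pages comes back as + * min(5000, (1000 + 3000) / 2) = 2000.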
+ */ +unsigned long max_sane_readahead(unsigned long nr) +{ + unsigned long active; + unsigned long inactive; + unsigned long free; + + get_zone_counts(&active, &inactive, &free); + return min(nr, (inactive + free) / 2); +} diff -Nru a/mm/truncate.c b/mm/truncate.c --- a/mm/truncate.c Fri Oct 31 14:10:54 2003 +++ b/mm/truncate.c Fri Oct 31 14:10:54 2003 @@ -74,6 +74,9 @@ if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; + if (page->mapping != mapping) + return 0; + spin_lock(&mapping->page_lock); if (PageDirty(page)) { spin_unlock(&mapping->page_lock); @@ -177,6 +180,41 @@ } EXPORT_SYMBOL(truncate_inode_pages); + +void truncate_mapping_pages_range(struct address_space *mapping, + pgoff_t start, long count) +{ + const pgoff_t end = start + count - 1; + struct pagevec pvec; + int i; + pgoff_t next; + + pagevec_init(&pvec, 0); + next = start; + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + + if (page_index > end) { + pagevec_release(&pvec); + return; + } + + lock_page(page); + wait_on_page_writeback(page); + + truncate_complete_page(mapping, page); + unlock_page(page); + + if (page_index > next) + next = page_index; + next++; + } + pagevec_release(&pvec); + cond_resched(); + } +} /** * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode diff -Nru a/mm/truncate.c~truncate_mapping_pages_range.diff b/mm/truncate.c~truncate_mapping_pages_range.diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/mm/truncate.c~truncate_mapping_pages_range.diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,273 @@ +/* + * mm/truncate.c - code for taking down pages from address_spaces + * + * Copyright (C) 2002, Linus Torvalds + * + * 10Sep2002 akpm@zip.com.au + * Initial version. + */ + +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/pagemap.h> +#include <linux/pagevec.h> +#include <linux/buffer_head.h> /* grr. try_to_release_page, + block_invalidatepage */ + + +static int do_invalidatepage(struct page *page, unsigned long offset) +{ + int (*invalidatepage)(struct page *, unsigned long); + invalidatepage = page->mapping->a_ops->invalidatepage; + if (invalidatepage == NULL) + invalidatepage = block_invalidatepage; + return (*invalidatepage)(page, offset); +} + +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + if (PagePrivate(page)) + do_invalidatepage(page, partial); +} + +/* + * If truncate cannot remove the fs-private metadata from the page, the page + * becomes anonymous. It will be left on the LRU and may even be mapped into + * user pagetables if we're racing with filemap_nopage(). + * + * We need to bale out if page->mapping is no longer equal to the original + * mapping. This happens a) when the VM reclaimed the page while we waited on + * its lock, b) when a concurrent invalidate_inode_pages got there first and + * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. + */ +static void +truncate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return; + + if (PagePrivate(page)) + do_invalidatepage(page, 0); + + clear_page_dirty(page); + ClearPageUptodate(page); + ClearPageMappedToDisk(page); + remove_from_page_cache(page); + page_cache_release(page); /* pagecache ref */ +} + +/* + * This is for invalidate_inode_pages(). 
That function can be called at + * any time, and is not supposed to throw away dirty pages. But pages can + * be marked dirty at any time too. So we re-check the dirtiness inside + * ->page_lock. That provides exclusion against the __set_page_dirty + * functions. + */ +static int +invalidate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return 0; + + if (PagePrivate(page) && !try_to_release_page(page, 0)) + return 0; + + spin_lock(&mapping->page_lock); + if (PageDirty(page)) { + spin_unlock(&mapping->page_lock); + return 0; + } + __remove_from_page_cache(page); + spin_unlock(&mapping->page_lock); + ClearPageUptodate(page); + page_cache_release(page); /* pagecache ref */ + return 1; +} + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * + * Truncate the page cache at a set offset, removing the pages that are beyond + * that offset (and zeroing out partial pages). + * + * Truncate takes two passes - the first pass is nonblocking. It will not + * block on page locks and it will not block on writeback. The second pass + * will wait. This is to prevent as much IO as possible in the affected region. + * The first pass will remove most pages, so the search cost of the second pass + * is low. + * + * When looking at page->index outside the page lock we need to be careful to + * copy it into a local to avoid races (it could change at any time). + * + * We pass down the cache-hot hint to the page freeing code. Even if the + * mapping is large, it is probably the case that the final pages are the most + * recently touched, and freeing happens in ascending file offset order. + * + * Called under (and serialised by) inode->i_sem. 
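+ * + * For example, with 4096-byte pages, truncating at lstart == 10240 + * gives start == 3 and partial == 2048: pages 3 and up are removed, + * and the final 2048 bytes of page 2 are zeroed out.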
+ */ +void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +{ + const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + struct pagevec pvec; + pgoff_t next; + int i; + + if (mapping->nrpages == 0) + return; + + pagevec_init(&pvec, 0); + next = start; + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + + if (page_index > next) + next = page_index; + next++; + if (TestSetPageLocked(page)) + continue; + if (PageWriteback(page)) { + unlock_page(page); + continue; + } + truncate_complete_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + + if (partial) { + struct page *page = find_lock_page(mapping, start - 1); + if (page) { + wait_on_page_writeback(page); + truncate_partial_page(page, partial); + unlock_page(page); + page_cache_release(page); + } + } + + next = start; + for ( ; ; ) { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + if (next == start) + break; + next = start; + continue; + } + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + wait_on_page_writeback(page); + if (page->index > next) + next = page->index; + next++; + truncate_complete_page(mapping, page); + unlock_page(page); + } + pagevec_release(&pvec); + } +} + +EXPORT_SYMBOL(truncate_inode_pages); + +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + pgoff_t next = start; + unsigned long ret = 0; + int i; + + pagevec_init(&pvec, 0); + while (next <= end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + if (TestSetPageLocked(page)) { + next++; + continue; + } + if (page->index > next) + next = page->index; + next++; + if (PageDirty(page) || PageWriteback(page)) + goto unlock; + if (page_mapped(page)) + goto unlock; + ret += invalidate_complete_page(mapping, page); +unlock: + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return ret; +} + +unsigned long invalidate_inode_pages(struct address_space *mapping) +{ + return invalidate_mapping_pages(mapping, 0, ~0UL); +} + +EXPORT_SYMBOL(invalidate_inode_pages); + +/** + * invalidate_inode_pages2 - remove all unmapped pages from an address_space + * @mapping - the address_space + * + * invalidate_inode_pages2() is like truncate_inode_pages(), except for the case + * where the page is seen to be mapped into process pagetables. In that case, + * the page is marked clean but is left attached to its address_space. + * + * FIXME: invalidate_inode_pages2() is probably trivially livelockable. 
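+ * (NFS, for one, uses this to shoot down pagecache which has gone stale + * behind the client's back.)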
+ */ +void invalidate_inode_pages2(struct address_space *mapping) +{ + struct pagevec pvec; + pgoff_t next = 0; + int i; + + pagevec_init(&pvec, 0); + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + if (page->mapping == mapping) { /* truncate race? */ + wait_on_page_writeback(page); + next = page->index + 1; + if (page_mapped(page)) + clear_page_dirty(page); + else + invalidate_complete_page(mapping, page); + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +EXPORT_SYMBOL_GPL(invalidate_inode_pages2); diff -Nru a/mm/vmscan.c b/mm/vmscan.c --- a/mm/vmscan.c Fri Oct 31 14:10:53 2003 +++ b/mm/vmscan.c Fri Oct 31 14:10:53 2003 @@ -412,6 +412,11 @@ if (PagePrivate(page)) { if (!try_to_release_page(page, gfp_mask)) goto activate_locked; + /* + * file system may manually remove page from the page + * cache in ->releasepage(). Check for this. + */ + mapping = page->mapping; if (!mapping && page_count(page) == 1) goto free_it; } diff -Nru a/net/Kconfig b/net/Kconfig --- a/net/Kconfig Fri Oct 31 14:10:53 2003 +++ b/net/Kconfig Fri Oct 31 14:10:53 2003 @@ -96,15 +96,9 @@ allows you to ping yourself (great fun, that!). For an excellent introduction to Linux networking, please read the - NET-3-HOWTO, available from + Linux Networking HOWTO, available from <http://www.tldp.org/docs.html#howto>. - This option is also necessary if you want to use the full power of - term (term is a program which gives you almost full Internet - connectivity if you have a regular dial up shell account on some - Internet connected Unix computer; for more information, read - <http://www.bart.nl/~patrickr/term-howto/Term-HOWTO.html>). - If you say Y here and also to "/proc file system support" and "Sysctl support" below, you can change various aspects of the behavior of the TCP/IP code by writing to the (virtual) files in @@ -120,8 +114,10 @@ tristate "The IPv6 protocol (EXPERIMENTAL)" depends on INET && EXPERIMENTAL ---help--- - This is experimental support for the next version of the Internet - Protocol: IP version 6 (also called IPng "IP next generation"). + This is experimental support for the IP version 6 (formerly called + IPng "IP next generation"). You will still be able to do + regular IPv4 networking as well. + Features of this new protocol include: expanded address space, authentication and privacy, and seamless interoperability with the current version of IP (IP version 4). For general information about @@ -130,12 +126,10 @@ <http://www.bieringer.de/linux/IPv6/> and the file net/ipv6/README in the kernel source. - If you want to use IPv6, please upgrade to the newest net-tools as - given in <file:Documentation/Changes>. You will still be able to do - regular IPv4 networking as well. - - To compile this protocol support as a module, choose M here: the - module will be called ipv6. + To compile this protocol support as a module, choose M here: the + module will be called ipv6. If you try building this as a module + and you have said Y to "Kernel module loader support" above, + be sure to add 'alias net-pf-10 ipv6' to your /etc/modules.conf file. It is safe to say N here for now. 
diff -Nru a/net/compat.c b/net/compat.c --- a/net/compat.c Fri Oct 31 14:10:54 2003 +++ b/net/compat.c Fri Oct 31 14:10:54 2003 @@ -322,7 +322,7 @@ u32 origsize, tmp32, num_counters; unsigned int repl_nat_size; int ret; - int i, num_ents; + int i; compat_uptr_t ucntrs; if (get_user(origsize, &urepl->size)) @@ -366,15 +366,10 @@ __put_user(compat_ptr(ucntrs), &repl_nat->counters)) goto out; - num_ents = origsize / sizeof(struct ipt_entry); - - for (i = 0; i < num_ents; i++) { - struct ipt_entry ent; - - if (__copy_from_user(&ent, &urepl->entries[i], sizeof(ent)) || - __copy_to_user(&repl_nat->entries[i], &ent, sizeof(ent))) - goto out; - } + if (__copy_in_user(&repl_nat->entries[0], + &urepl->entries[0], + origsize)) + goto out; for (i = 0; i < NF_IP_NUMHOOKS; i++) { if (__get_user(tmp32, &urepl->hook_entry[i]) || diff -Nru a/net/core/dev.c b/net/core/dev.c --- a/net/core/dev.c Fri Oct 31 14:10:54 2003 +++ b/net/core/dev.c Fri Oct 31 14:10:54 2003 @@ -550,6 +550,32 @@ return dev; } +struct net_device *__dev_getfirstbyhwtype(unsigned short type) +{ + struct net_device *dev; + + for (dev = dev_base; dev; dev = dev->next) + if (dev->type == type) + break; + return dev; +} + +EXPORT_SYMBOL(__dev_getfirstbyhwtype); + +struct net_device *dev_getfirstbyhwtype(unsigned short type) +{ + struct net_device *dev; + + rtnl_lock(); + dev = __dev_getfirstbyhwtype(type); + if (dev) + dev_hold(dev); + rtnl_unlock(); + return dev; +} + +EXPORT_SYMBOL(dev_getfirstbyhwtype); + /** * dev_get_by_flags - find any device with given flags * @if_flags: IFF_* values @@ -3023,7 +3049,7 @@ return rc; } -subsys_initcall(net_dev_init); +fs_initcall(net_dev_init); EXPORT_SYMBOL(__dev_get); EXPORT_SYMBOL(__dev_get_by_flags); diff -Nru a/net/core/skbuff.c b/net/core/skbuff.c --- a/net/core/skbuff.c Fri Oct 31 14:10:54 2003 +++ b/net/core/skbuff.c Fri Oct 31 14:10:54 2003 @@ -583,6 +583,8 @@ */ struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, gfp_mask); + int head_copy_len, head_copy_off; + if (!n) return NULL; @@ -591,8 +593,16 @@ /* Set the tail pointer and length */ skb_put(n, skb->len); - /* Copy the data only. */ - if (skb_copy_bits(skb, 0, n->data, skb->len)) + head_copy_len = skb_headroom(skb); + head_copy_off = 0; + if (newheadroom < head_copy_len) { + head_copy_off = head_copy_len - newheadroom; + head_copy_len = newheadroom; + } + + /* Copy the linear header and data. 
*/ + if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)) BUG(); copy_skb_header(n, skb); diff -Nru a/net/core/sock.c b/net/core/sock.c --- a/net/core/sock.c Fri Oct 31 14:10:54 2003 +++ b/net/core/sock.c Fri Oct 31 14:10:54 2003 @@ -154,8 +154,14 @@ static void sock_warn_obsolete_bsdism(const char *name) { - printk(KERN_WARNING "process `%s' is using obsolete " - "%s SO_BSDCOMPAT\n", current->comm, name); + static int warned; + static char warncomm[16]; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); + printk(KERN_WARNING "process `%s' is using obsolete " + "%s SO_BSDCOMPAT\n", warncomm, name); + warned++; + } } /* diff -Nru a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c --- a/net/core/sysctl_net_core.c Fri Oct 31 14:10:53 2003 +++ b/net/core/sysctl_net_core.c Fri Oct 31 14:10:53 2003 @@ -29,6 +29,7 @@ extern int sysctl_core_destroy_delay; extern int sysctl_optmem_max; +extern int sysctl_somaxconn; #ifdef CONFIG_NET_DIVERT extern char sysctl_divert_version[]; @@ -174,6 +175,14 @@ }, #endif /* CONFIG_NET_DIVERT */ #endif /* CONFIG_NET */ + { + .ctl_name = NET_CORE_SOMAXCONN, + .procname = "somaxconn", + .data = &sysctl_somaxconn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; diff -Nru a/net/ipv4/ah4.c b/net/ipv4/ah4.c --- a/net/ipv4/ah4.c Fri Oct 31 14:10:54 2003 +++ b/net/ipv4/ah4.c Fri Oct 31 14:10:54 2003 @@ -245,6 +245,9 @@ struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; + if (!x->aalg) + goto error; + /* null auth can use a zero length key */ if (x->aalg->alg_key_len > 512) goto error; diff -Nru a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c --- a/net/ipv4/ip_gre.c Fri Oct 31 14:10:54 2003 +++ b/net/ipv4/ip_gre.c Fri Oct 31 14:10:54 2003 @@ -276,6 +276,8 @@ return NULL; dev->init = ipgre_tunnel_init; + nt = dev->priv; + nt->parms = *parms; if (register_netdevice(dev) < 0) { kfree(dev); diff -Nru a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c --- a/net/ipv4/ipcomp.c Fri Oct 31 14:10:54 2003 +++ b/net/ipv4/ipcomp.c Fri Oct 31 14:10:54 2003 @@ -344,10 +344,15 @@ static int ipcomp_init_state(struct xfrm_state *x, void *args) { - int err = -ENOMEM; + int err; struct ipcomp_data *ipcd; struct xfrm_algo_desc *calg_desc; + err = -EINVAL; + if (!x->calg) + goto out; + + err = -ENOMEM; ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL); if (!ipcd) goto error; diff -Nru a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c --- a/net/ipv4/ipmr.c Fri Oct 31 14:10:53 2003 +++ b/net/ipv4/ipmr.c Fri Oct 31 14:10:53 2003 @@ -1124,18 +1124,16 @@ * Processing handlers for ipmr_forward */ -static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, - int vifi, int last) +static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) { struct iphdr *iph = skb->nh.iph; struct vif_device *vif = &vif_table[vifi]; struct net_device *dev; struct rtable *rt; int encap = 0; - struct sk_buff *skb2; if (vif->dev == NULL) - return; + goto out_free; #ifdef CONFIG_IP_PIMSM if (vif->flags & VIFF_REGISTER) { @@ -1144,6 +1142,7 @@ ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len; ((struct net_device_stats*)vif->dev->priv)->tx_packets++; ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); + kfree_skb(skb); return; } #endif @@ -1156,7 +1155,7 @@ .tos = RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; if (ip_route_output_key(&rt, &fl)) - return; + goto out_free; encap = sizeof(struct iphdr); } else { struct flowi fl = { .oif = vif->link, @@ -1165,7 +1164,7 @@ .tos = 
RT_TOS(iph->tos) } }, .proto = IPPROTO_IPIP }; if (ip_route_output_key(&rt, &fl)) - return; + goto out_free; } dev = rt->u.dst.dev; @@ -1178,43 +1177,34 @@ IP_INC_STATS_BH(IpFragFails); ip_rt_put(rt); - return; + goto out_free; } - encap += LL_RESERVED_SPACE(dev); + encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len; - if (skb_headroom(skb) < encap || skb_cloned(skb) || !last) - skb2 = skb_realloc_headroom(skb, (encap + 15)&~15); - else if (atomic_read(&skb->users) != 1) - skb2 = skb_clone(skb, GFP_ATOMIC); - else { - atomic_inc(&skb->users); - skb2 = skb; - } - - if (skb2 == NULL) { - ip_rt_put(rt); - return; + if (skb_cow(skb, encap)) { + ip_rt_put(rt); + goto out_free; } vif->pkt_out++; vif->bytes_out+=skb->len; - dst_release(skb2->dst); - skb2->dst = &rt->u.dst; - iph = skb2->nh.iph; + dst_release(skb->dst); + skb->dst = &rt->u.dst; + iph = skb->nh.iph; ip_decrease_ttl(iph); /* FIXME: forward and output firewalls used to be called here. * What do we do with netfilter? -- RR */ if (vif->flags & VIFF_TUNNEL) { - ip_encap(skb2, vif->local, vif->remote); + ip_encap(skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++; - ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb2->len; + ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len; } - IPCB(skb2)->flags |= IPSKB_FORWARDED; + IPCB(skb)->flags |= IPSKB_FORWARDED; /* * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally @@ -1227,8 +1217,13 @@ * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ - NF_HOOK(PF_INET, NF_IP_FORWARD, skb2, skb->dev, dev, + NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev, ipmr_forward_finish); + return; + +out_free: + kfree_skb(skb); + return; } static int ipmr_find_vif(struct net_device *dev) @@ -1299,13 +1294,24 @@ */ for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) { - if (psend != -1) - ipmr_queue_xmit(skb, cache, psend, 0); + if (psend != -1) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ipmr_queue_xmit(skb2, cache, psend); + } psend=ct; } } - if (psend != -1) - ipmr_queue_xmit(skb, cache, psend, !local); + if (psend != -1) { + if (local) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ipmr_queue_xmit(skb2, cache, psend); + } else { + ipmr_queue_xmit(skb, cache, psend); + return 0; + } + } dont_forward: if (!local) diff -Nru a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig --- a/net/ipv4/netfilter/Kconfig Fri Oct 31 14:10:54 2003 +++ b/net/ipv4/netfilter/Kconfig Fri Oct 31 14:10:54 2003 @@ -267,7 +267,7 @@ config IP_NF_MATCH_PHYSDEV tristate "Physdev match support" - depends on IP_NF_IPTABLES!=n && BRIDGE_NETFILTER + depends on IP_NF_IPTABLES && BRIDGE_NETFILTER help Physdev packet matching matches against the physical bridge ports the IP packet arrived on or will leave by. 
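For illustration, the physdev match described in the help text above is used from userspace like any other iptables match; a sketch, assuming an iptables build with the physdev extension installed ("eth0" is a placeholder bridge port name):

    # accept forwarded packets that entered the bridge via physical port eth0
    iptables -A FORWARD -m physdev --physdev-in eth0 -j ACCEPT
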
diff -Nru a/net/ipv4/netfilter/ip_fw_compat_masq.c b/net/ipv4/netfilter/ip_fw_compat_masq.c --- a/net/ipv4/netfilter/ip_fw_compat_masq.c Fri Oct 31 14:10:54 2003 +++ b/net/ipv4/netfilter/ip_fw_compat_masq.c Fri Oct 31 14:10:54 2003 @@ -91,9 +91,6 @@ WRITE_UNLOCK(&ip_nat_lock); return ret; } - - place_in_hashes(ct, info); - info->initialized = 1; } else DEBUGP("Masquerading already done on this conn.\n"); WRITE_UNLOCK(&ip_nat_lock); diff -Nru a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c --- a/net/ipv4/netfilter/ipt_REDIRECT.c Fri Oct 31 14:10:53 2003 +++ b/net/ipv4/netfilter/ipt_REDIRECT.c Fri Oct 31 14:10:53 2003 @@ -83,7 +83,7 @@ /* Device might not have an associated in_device. */ indev = (struct in_device *)(*pskb)->dev->ip_ptr; - if (indev == NULL) + if (indev == NULL || indev->ifa_list == NULL) return NF_DROP; /* Grab first address on interface. */ diff -Nru a/net/ipv4/tcp.c b/net/ipv4/tcp.c --- a/net/ipv4/tcp.c Fri Oct 31 14:10:53 2003 +++ b/net/ipv4/tcp.c Fri Oct 31 14:10:53 2003 @@ -1540,6 +1540,17 @@ if (copied && tp->urg_data && tp->urg_seq == *seq) break; + /* We need to check signals first, to get correct SIGURG + * handling. FIXME: Need to check this doesn't impact 1003.1g + * and move it down to the bottom of the loop + */ + if (signal_pending(current)) { + if (copied) + break; + copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; + break; + } + /* Next get a buffer. */ skb = skb_peek(&sk->sk_receive_queue); @@ -1576,7 +1587,6 @@ sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || - signal_pending(current) || (flags & MSG_PEEK)) break; } else { @@ -1604,11 +1614,6 @@ if (!timeo) { copied = -EAGAIN; - break; - } - - if (signal_pending(current)) { - copied = sock_intr_errno(timeo); break; } } diff -Nru a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c --- a/net/ipv4/tcp_input.c Fri Oct 31 14:10:54 2003 +++ b/net/ipv4/tcp_input.c Fri Oct 31 14:10:54 2003 @@ -1967,7 +1967,10 @@ struct sk_buff *skb, u32 ack, u32 ack_seq) { int flag = 0; - u32 nwin = ntohs(skb->h.th->window) << tp->snd_wscale; + u32 nwin = ntohs(skb->h.th->window); + + if (likely(!skb->h.th->syn)) + nwin <<= tp->snd_wscale; if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { flag |= FLAG_WIN_UPDATE; diff -Nru a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c --- a/net/ipv4/tcp_ipv4.c Fri Oct 31 14:10:53 2003 +++ b/net/ipv4/tcp_ipv4.c Fri Oct 31 14:10:53 2003 @@ -178,12 +178,6 @@ tcp_sk(sk)->bind_hash = tb; } -static inline const u32 tcp_v4_rcv_saddr(const struct sock *sk) -{ - return likely(sk->sk_state != TCP_TIME_WAIT) ? - inet_sk(sk)->rcv_saddr : tcptw_sk(sk)->tw_rcv_saddr; -} - static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) { const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); diff -Nru a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c --- a/net/ipv4/tcp_minisocks.c Fri Oct 31 14:10:54 2003 +++ b/net/ipv4/tcp_minisocks.c Fri Oct 31 14:10:54 2003 @@ -368,6 +368,11 @@ ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr); ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_v6_ipv6only = np->ipv6only; + } else { + memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr)); + memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr)); + tw->tw_v6_ipv6only = 0; } #endif /* Linkage updates. 
*/ diff -Nru a/net/ipv4/udp.c b/net/ipv4/udp.c --- a/net/ipv4/udp.c Fri Oct 31 14:10:53 2003 +++ b/net/ipv4/udp.c Fri Oct 31 14:10:53 2003 @@ -398,6 +398,8 @@ */ static int udp_push_pending_frames(struct sock *sk, struct udp_opt *up) { + struct inet_opt *inet = inet_sk(sk); + struct flowi *fl = &inet->cork.fl; struct sk_buff *skb; struct udphdr *uh; int err = 0; @@ -410,8 +412,8 @@ * Create a UDP header */ uh = skb->h.uh; - uh->source = up->sport; - uh->dest = up->dport; + uh->source = fl->fl_ip_sport; + uh->dest = fl->fl_ip_dport; uh->len = htons(up->len); uh->check = 0; @@ -426,12 +428,12 @@ */ if (skb->ip_summed == CHECKSUM_HW) { skb->csum = offsetof(struct udphdr, check); - uh->check = ~csum_tcpudp_magic(up->saddr, up->daddr, + uh->check = ~csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, IPPROTO_UDP, 0); } else { skb->csum = csum_partial((char *)uh, sizeof(struct udphdr), skb->csum); - uh->check = csum_tcpudp_magic(up->saddr, up->daddr, + uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, IPPROTO_UDP, skb->csum); if (uh->check == 0) uh->check = -1; @@ -456,7 +458,7 @@ skb_queue_walk(&sk->sk_write_queue, skb) { csum = csum_add(csum, skb->csum); } - uh->check = csum_tcpudp_magic(up->saddr, up->daddr, + uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len, IPPROTO_UDP, csum); if (uh->check == 0) uh->check = -1; @@ -520,8 +522,13 @@ * The socket lock must be held while it's corked. */ lock_sock(sk); - if (likely(up->pending)) + if (likely(up->pending)) { + if (unlikely(up->pending != AF_INET)) { + release_sock(sk); + return -EINVAL; + } goto do_append_data; + } release_sock(sk); } ulen += sizeof(struct udphdr); @@ -636,11 +643,11 @@ /* * Now cork the socket to pend data. */ - up->daddr = daddr; - up->dport = dport; - up->saddr = saddr; - up->sport = inet->sport; - up->pending = 1; + inet->cork.fl.fl4_dst = daddr; + inet->cork.fl.fl_ip_dport = dport; + inet->cork.fl.fl4_src = saddr; + inet->cork.fl.fl_ip_sport = inet->sport; + up->pending = AF_INET; do_append_data: up->len += ulen; diff -Nru a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c --- a/net/ipv6/addrconf.c Fri Oct 31 14:10:54 2003 +++ b/net/ipv6/addrconf.c Fri Oct 31 14:10:54 2003 @@ -970,36 +970,33 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) { - struct ipv6_pinfo *np = inet6_sk(sk); - int addr_type = ipv6_addr_type(&np->rcv_saddr); + const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; + const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); + u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; + u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); + int sk_ipv6only = ipv6_only_sock(sk); + int sk2_ipv6only = tcp_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(sk_rcv_saddr6); + int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; - if (!inet_sk(sk2)->rcv_saddr && !ipv6_only_sock(sk)) + if (!sk2_rcv_saddr && !sk_ipv6only) return 1; - if (sk2->sk_family == AF_INET6 && - ipv6_addr_any(&inet6_sk(sk2)->rcv_saddr) && - !(ipv6_only_sock(sk2) && addr_type == IPV6_ADDR_MAPPED)) + if (addr_type2 == IPV6_ADDR_ANY && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) return 1; if (addr_type == IPV6_ADDR_ANY && - (!ipv6_only_sock(sk) || - !(sk2->sk_family == AF_INET6 ? - (ipv6_addr_type(&inet6_sk(sk2)->rcv_saddr) == IPV6_ADDR_MAPPED) : - 1))) + !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) return 1; - if (sk2->sk_family == AF_INET6 && - !ipv6_addr_cmp(&np->rcv_saddr, - (sk2->sk_state != TCP_TIME_WAIT ? 
- &inet6_sk(sk2)->rcv_saddr : - &tcptw_sk(sk)->tw_v6_rcv_saddr))) + if (sk2_rcv_saddr6 && + !ipv6_addr_cmp(sk_rcv_saddr6, sk2_rcv_saddr6)) return 1; if (addr_type == IPV6_ADDR_MAPPED && - !ipv6_only_sock(sk2) && - (!inet_sk(sk2)->rcv_saddr || - !inet_sk(sk)->rcv_saddr || - inet_sk(sk)->rcv_saddr == inet_sk(sk2)->rcv_saddr)) + !sk2_ipv6only && + (!sk2_rcv_saddr || !sk_rcv_saddr || sk_rcv_saddr == sk2_rcv_saddr)) return 1; return 0; @@ -1109,24 +1106,22 @@ static int __ipv6_regen_rndid(struct inet6_dev *idev) { struct net_device *dev; - u8 eui64[8]; - u8 digest[16]; struct scatterlist sg[2]; sg[0].page = virt_to_page(idev->entropy); sg[0].offset = offset_in_page(idev->entropy); sg[0].length = 8; - sg[1].page = virt_to_page(eui64); - sg[1].offset = offset_in_page(eui64); + sg[1].page = virt_to_page(idev->work_eui64); + sg[1].offset = offset_in_page(idev->work_eui64); sg[1].length = 8; dev = idev->dev; - if (ipv6_generate_eui64(eui64, dev)) { + if (ipv6_generate_eui64(idev->work_eui64, dev)) { printk(KERN_INFO "__ipv6_regen_rndid(idev=%p): cannot get EUI64 identifier; use random bytes.\n", idev); - get_random_bytes(eui64, sizeof(eui64)); + get_random_bytes(idev->work_eui64, sizeof(idev->work_eui64)); } regen: spin_lock(&md5_tfm_lock); @@ -1136,12 +1131,12 @@ } crypto_digest_init(md5_tfm); crypto_digest_update(md5_tfm, sg, 2); - crypto_digest_final(md5_tfm, digest); + crypto_digest_final(md5_tfm, idev->work_digest); spin_unlock(&md5_tfm_lock); - memcpy(idev->rndid, &digest[0], 8); + memcpy(idev->rndid, &idev->work_digest[0], 8); idev->rndid[0] &= ~0x02; - memcpy(idev->entropy, &digest[8], 8); + memcpy(idev->entropy, &idev->work_digest[8], 8); /* * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>: diff -Nru a/net/ipv6/ah6.c b/net/ipv6/ah6.c --- a/net/ipv6/ah6.c Fri Oct 31 14:10:54 2003 +++ b/net/ipv6/ah6.c Fri Oct 31 14:10:54 2003 @@ -380,6 +380,9 @@ struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; + if (!x->aalg) + goto error; + /* null auth can use a zero length key */ if (x->aalg->alg_key_len > 512) goto error; diff -Nru a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c --- a/net/ipv6/ip6_output.c Fri Oct 31 14:10:53 2003 +++ b/net/ipv6/ip6_output.c Fri Oct 31 14:10:53 2003 @@ -1239,7 +1239,7 @@ } dst_hold(&rt->u.dst); np->cork.rt = rt; - np->cork.fl = fl; + inet->cork.fl = *fl; np->cork.hop_limit = hlimit; inet->cork.fragsize = mtu = dst_pmtu(&rt->u.dst); inet->cork.length = 0; @@ -1250,6 +1250,7 @@ transhdrlen += exthdrlen; } else { rt = np->cork.rt; + fl = &inet->cork.fl; if (inet->cork.flags & IPCORK_OPT) opt = np->cork.opt; transhdrlen = 0; @@ -1423,7 +1424,7 @@ struct ipv6hdr *hdr; struct ipv6_txoptions *opt = np->cork.opt; struct rt6_info *rt = np->cork.rt; - struct flowi *fl = np->cork.fl; + struct flowi *fl = &inet->cork.fl; unsigned char proto = fl->proto; int err = 0; @@ -1487,9 +1488,7 @@ dst_release(&np->cork.rt->u.dst); np->cork.rt = NULL; } - if (np->cork.fl) { - np->cork.fl = NULL; - } + memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); return err; error: goto out; @@ -1514,7 +1513,5 @@ dst_release(&np->cork.rt->u.dst); np->cork.rt = NULL; } - if (np->cork.fl) { - np->cork.fl = NULL; - } + memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); } diff -Nru a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c --- a/net/ipv6/ipcomp6.c Fri Oct 31 14:10:53 2003 +++ b/net/ipv6/ipcomp6.c Fri Oct 31 14:10:53 2003 @@ -276,10 +276,15 @@ static int ipcomp6_init_state(struct xfrm_state *x, void *args) { - int err = -ENOMEM; + int err; struct ipcomp_data *ipcd; struct xfrm_algo_desc *calg_desc; + 
err = -EINVAL; + if (!x->calg) + goto out; + + err = -ENOMEM; ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL); if (!ipcd) goto error; diff -Nru a/net/ipv6/mcast.c b/net/ipv6/mcast.c --- a/net/ipv6/mcast.c Fri Oct 31 14:10:54 2003 +++ b/net/ipv6/mcast.c Fri Oct 31 14:10:54 2003 @@ -604,9 +604,9 @@ if (ipv6_addr_cmp(&psl->sl_addr[i], src_addr) == 0) break; } - if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count); + if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) rv = 0; - if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count); + if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) rv = 0; } read_unlock(&ipv6_sk_mc_lock); diff -Nru a/net/ipv6/udp.c b/net/ipv6/udp.c --- a/net/ipv6/udp.c Fri Oct 31 14:10:54 2003 +++ b/net/ipv6/udp.c Fri Oct 31 14:10:54 2003 @@ -720,8 +720,8 @@ { struct sk_buff *skb; struct udphdr *uh; - struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi *fl = np->cork.fl; + struct inet_opt *inet = inet_sk(sk); + struct flowi *fl = &inet->cork.fl; int err = 0; /* Grab the skbuff where UDP header space exists. */ @@ -783,15 +783,60 @@ struct in6_addr *daddr; struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; - struct flowi fl; + struct flowi *fl = &inet->cork.fl; struct dst_entry *dst; int addr_len = msg->msg_namelen; int ulen = len; - int addr_type; int hlimit = -1; int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int err; - + + /* destination address check */ + if (sin6) { + if (addr_len < offsetof(struct sockaddr, sa_data)) + return -EINVAL; + + switch (sin6->sin6_family) { + case AF_INET6: + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + daddr = &sin6->sin6_addr; + break; + case AF_INET: + goto do_udp_sendmsg; + case AF_UNSPEC: + msg->msg_name = sin6 = NULL; + msg->msg_namelen = addr_len = 0; + daddr = NULL; + break; + default: + return -EINVAL; + } + } else if (!up->pending) { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = &np->daddr; + } else + daddr = NULL; + + if (daddr) { + if (ipv6_addr_type(daddr) == IPV6_ADDR_MAPPED) { + struct sockaddr_in sin; + sin.sin_family = AF_INET; + sin.sin_port = sin6 ? 
sin6->sin6_port : inet->dport; + sin.sin_addr.s_addr = daddr->s6_addr32[3]; + msg->msg_name = &sin; + msg->msg_namelen = sizeof(sin); +do_udp_sendmsg: + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + return udp_sendmsg(iocb, sk, msg, len); + } + } + + if (up->pending == AF_INET) + return udp_sendmsg(iocb, sk, msg, len); + /* Rough check on arithmetic overflow, better check is made in ip6_build_xmit @@ -805,6 +850,10 @@ */ lock_sock(sk); if (likely(up->pending)) { + if (unlikely(up->pending != AF_INET6)) { + release_sock(sk); + return -EINVAL; + } dst = NULL; goto do_append_data; } @@ -812,31 +861,19 @@ } ulen += sizeof(struct udphdr); - memset(&fl, 0, sizeof(fl)); + memset(fl, 0, sizeof(*fl)); if (sin6) { - if (sin6->sin6_family == AF_INET) { - if (__ipv6_only_sock(sk)) - return -ENETUNREACH; - return udp_sendmsg(iocb, sk, msg, len); - } - - if (addr_len < SIN6_LEN_RFC2133) - return -EINVAL; - - if (sin6->sin6_family && sin6->sin6_family != AF_INET6) - return -EINVAL; - if (sin6->sin6_port == 0) return -EINVAL; - up->dport = sin6->sin6_port; + fl->fl_ip_dport = sin6->sin6_port; daddr = &sin6->sin6_addr; if (np->sndflow) { - fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; - if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + fl->fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); if (flowlabel == NULL) return -EINVAL; daddr = &flowlabel->dst; @@ -854,48 +891,30 @@ if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) - fl.oif = sin6->sin6_scope_id; + fl->oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; - up->dport = inet->dport; + fl->fl_ip_dport = inet->dport; daddr = &np->daddr; - fl.fl6_flowlabel = np->flow_label; - } - - addr_type = ipv6_addr_type(daddr); - - if (addr_type == IPV6_ADDR_MAPPED) { - struct sockaddr_in sin; - - if (__ipv6_only_sock(sk)) - return -ENETUNREACH; - - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = daddr->s6_addr32[3]; - sin.sin_port = up->dport; - msg->msg_name = (struct sockaddr *)(&sin); - msg->msg_namelen = sizeof(sin); - fl6_sock_release(flowlabel); - - return udp_sendmsg(iocb, sk, msg, len); + fl->fl6_flowlabel = np->flow_label; } - if (!fl.oif) - fl.oif = sk->sk_bound_dev_if; + if (!fl->oif) + fl->oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); - err = datagram_send_ctl(msg, &fl, opt, &hlimit); + err = datagram_send_ctl(msg, fl, opt, &hlimit); if (err < 0) { fl6_sock_release(flowlabel); return err; } - if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { - flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if ((fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); if (flowlabel == NULL) return -EINVAL; } @@ -907,28 +926,27 @@ if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); - fl.proto = IPPROTO_UDP; - ipv6_addr_copy(&fl.fl6_dst, daddr); - if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl_ip_dport = up->dport; - fl.fl_ip_sport = inet->sport; + fl->proto = IPPROTO_UDP; + ipv6_addr_copy(&fl->fl6_dst, daddr); + if (ipv6_addr_any(&fl->fl6_src) && !ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&fl->fl6_src, &np->saddr); + fl->fl_ip_sport = inet->sport; /* merge ip6_build_xmit 
from ip6_output */ if (opt && opt->srcrt) { struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + ipv6_addr_copy(&fl->fl6_dst, rt0->addr); } - if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) - fl.oif = np->mcast_oif; + if (!fl->oif && ipv6_addr_is_multicast(&fl->fl6_dst)) + fl->oif = np->mcast_oif; - err = ip6_dst_lookup(sk, &dst, &fl); + err = ip6_dst_lookup(sk, &dst, fl); if (err) goto out; if (hlimit < 0) { - if (ipv6_addr_is_multicast(&fl.fl6_dst)) + if (ipv6_addr_is_multicast(&fl->fl6_dst)) hlimit = np->mcast_hops; else hlimit = np->hop_limit; @@ -951,12 +969,12 @@ goto out; } - up->pending = 1; + up->pending = AF_INET6; do_append_data: up->len += ulen; err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), - hlimit, opt, &fl, (struct rt6_info*)dst, + hlimit, opt, fl, (struct rt6_info*)dst, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_v6_flush_pending_frames(sk); @@ -965,7 +983,7 @@ if (dst) ip6_dst_store(sk, dst, - !ipv6_addr_cmp(&fl.fl6_dst, &np->daddr) ? + !ipv6_addr_cmp(&fl->fl6_dst, &np->daddr) ? &np->daddr : NULL); if (err > 0) err = np->recverr ? net_xmit_errno(err) : 0; diff -Nru a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c --- a/net/ipv6/xfrm6_policy.c Fri Oct 31 14:10:53 2003 +++ b/net/ipv6/xfrm6_policy.c Fri Oct 31 14:10:53 2003 @@ -169,7 +169,7 @@ dst_prev->output = dst_prev->xfrm->type->output; /* Sheit... I remember I did this right. Apparently, * it was magically lost, so this code needs audit */ - x->u.rt6.rt6i_flags = rt0->rt6i_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL); + x->u.rt6.rt6i_flags = rt0->rt6i_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL|RTF_NDISC); x->u.rt6.rt6i_metric = rt0->rt6i_metric; x->u.rt6.rt6i_node = rt0->rt6i_node; x->u.rt6.rt6i_gateway = rt0->rt6i_gateway; diff -Nru a/net/llc/af_llc.c b/net/llc/af_llc.c --- a/net/llc/af_llc.c Fri Oct 31 14:10:53 2003 +++ b/net/llc/af_llc.c Fri Oct 31 14:10:53 2003 @@ -187,6 +187,8 @@ llc_release_sockets(llc->sap); llc_sap_close(llc->sap); } + if (llc->dev) + dev_put(llc->dev); sock_put(sk); llc_sk_free(sk); out: @@ -244,31 +246,20 @@ struct sock *sk = sock->sk; struct llc_opt *llc = llc_sk(sk); struct llc_sap *sap; - struct net_device *dev = NULL; int rc = -EINVAL; if (!sk->sk_zapped) goto out; - /* bind to a specific mac, optional. */ - if (!llc_mac_null(addr->sllc_smac)) { - rtnl_lock(); - dev = dev_getbyhwaddr(addr->sllc_arphrd, addr->sllc_smac); - rtnl_unlock(); - rc = -ENETUNREACH; - if (!dev) - goto out; - llc->dev = dev; - } /* bind to a specific sap, optional. 
*/ - if (!addr->sllc_ssap) { + if (!addr->sllc_sap) { rc = -EUSERS; - addr->sllc_ssap = llc_ui_autoport(); - if (!addr->sllc_ssap) + addr->sllc_sap = llc_ui_autoport(); + if (!addr->sllc_sap) goto out; } - sap = llc_sap_find(addr->sllc_ssap); + sap = llc_sap_find(addr->sllc_sap); if (!sap) { - sap = llc_sap_open(addr->sllc_ssap, NULL); + sap = llc_sap_open(addr->sllc_sap, NULL); rc = -EBUSY; /* some other network layer is using the sap */ if (!sap) goto out; @@ -276,20 +267,14 @@ struct llc_addr laddr, daddr; struct sock *ask; - rc = -EUSERS; /* can't get exclusive use of sap */ - if (!dev && llc_mac_null(addr->sllc_mmac)) - goto out; memset(&laddr, 0, sizeof(laddr)); memset(&daddr, 0, sizeof(daddr)); - if (!llc_mac_null(addr->sllc_mmac)) { - if (sk->sk_type != SOCK_DGRAM) { - rc = -EOPNOTSUPP; - goto out; - } - memcpy(laddr.mac, addr->sllc_mmac, IFHWADDRLEN); - } else - memcpy(laddr.mac, addr->sllc_smac, IFHWADDRLEN); - laddr.lsap = addr->sllc_ssap; + /* + * FIXME: check if the address is multicast, + * only SOCK_DGRAM can do this. + */ + memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN); + laddr.lsap = addr->sllc_sap; rc = -EADDRINUSE; /* mac + sap clash. */ ask = llc_lookup_established(sap, &daddr, &laddr); if (ask) { @@ -297,11 +282,9 @@ goto out; } } - llc->laddr.lsap = addr->sllc_ssap; + llc->laddr.lsap = addr->sllc_sap; if (llc->dev) memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN); - llc->daddr.lsap = addr->sllc_dsap; - memcpy(llc->daddr.mac, addr->sllc_dmac, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); /* assign new connection to its SAP */ llc_sap_add_socket(sap, sk); @@ -334,7 +317,7 @@ struct sock *sk = sock->sk; int rc = -EINVAL; - dprintk("%s: binding %02X\n", __FUNCTION__, addr->sllc_ssap); + dprintk("%s: binding %02X\n", __FUNCTION__, addr->sllc_sap); if (!sk->sk_zapped || addrlen != sizeof(*addr)) goto out; rc = -EAFNOSUPPORT; @@ -386,9 +369,9 @@ * @flags: Operational flags specified by the user. * * Connect to a remote llc2 mac + sap. The caller must specify the - * destination mac and address to connect to. If the user previously - * called bind(2) with a smac the user does not need to specify the source - * address and mac. + * destination mac and address to connect to. If the user hasn't previously + * called bind(2) with a smac, the address of the first interface of the + * specified arp type will be used. * This function will autobind if user did not previously call bind. * Returns: 0 upon success, negative otherwise. 
*/ @@ -413,14 +396,16 @@ rc = llc_ui_autobind(sock, addr); if (rc) goto out; + llc->daddr.lsap = addr->sllc_sap; + memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN); } if (!llc->dev) { - rtnl_lock(); - dev = dev_getbyhwaddr(addr->sllc_arphrd, addr->sllc_smac); - rtnl_unlock(); + rc = -ENODEV; + dev = dev_getfirstbyhwtype(addr->sllc_arphrd); if (!dev) goto out; llc->dev = dev; + memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN); } else dev = llc->dev; if (sk->sk_type != SOCK_STREAM) @@ -432,7 +417,7 @@ sk->sk_state = TCP_SYN_SENT; llc->link = llc_ui_next_link_no(llc->sap->laddr.lsap); rc = llc_establish_connection(sk, dev->dev_addr, - addr->sllc_dmac, addr->sllc_dsap); + addr->sllc_mac, addr->sllc_sap); if (rc) { dprintk("%s: llc_ui_send_conn failed :-(\n", __FUNCTION__); sock->state = SS_UNCONNECTED; @@ -491,12 +476,6 @@ add_wait_queue_exclusive(sk->sk_sleep, &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); - rc = -ERESTARTSYS; - if (signal_pending(current)) - break; - rc = -EAGAIN; - if (!timeout) - break; rc = 0; if (sk->sk_state != TCP_CLOSE) { release_sock(sk); @@ -504,6 +483,12 @@ lock_sock(sk); } else break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk->sk_sleep, &wait); @@ -521,12 +506,6 @@ rc = -EAGAIN; if (sk->sk_state == TCP_CLOSE) break; - rc = -ERESTARTSYS; - if (signal_pending(current)) - break; - rc = -EAGAIN; - if (!timeout) - break; rc = 0; if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); @@ -534,6 +513,12 @@ lock_sock(sk); } else break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk->sk_sleep, &wait); @@ -550,12 +535,6 @@ __set_current_state(TASK_INTERRUPTIBLE); if (sk->sk_shutdown & RCV_SHUTDOWN) break; - rc = -ERESTARTSYS; - if (signal_pending(current)) - break; - rc = -EAGAIN; - if (!timeout) - break; /* * Well, if we have backlog, try to process it now. 
*/ @@ -570,6 +549,12 @@ lock_sock(sk); } else break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk->sk_sleep, &wait); @@ -589,12 +574,6 @@ rc = -ENOTCONN; if (sk->sk_shutdown & RCV_SHUTDOWN) break; - rc = -ERESTARTSYS; - if (signal_pending(current)) - break; - rc = -EAGAIN; - if (!timeout) - break; rc = 0; if (llc_data_accept_state(llc->state) || llc->p_flag) { release_sock(sk); @@ -602,6 +581,12 @@ lock_sock(sk); } else break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk->sk_sleep, &wait); @@ -625,7 +610,7 @@ int rc = -EOPNOTSUPP; dprintk("%s: accepting on %02X\n", __FUNCTION__, - llc_sk(sk)->addr.sllc_ssap); + llc_sk(sk)->addr.sllc_sap); lock_sock(sk); if (sk->sk_type != SOCK_STREAM) goto out; @@ -637,7 +622,7 @@ if (rc) goto out; dprintk("%s: got a new connection on %02X\n", __FUNCTION__, - llc_sk(sk)->addr.sllc_ssap); + llc_sk(sk)->addr.sllc_sap); skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) @@ -653,8 +638,6 @@ llc = llc_sk(sk); newllc = llc_sk(newsk); memcpy(&newllc->addr, &llc->addr, sizeof(newllc->addr)); - memcpy(newllc->addr.sllc_dmac, newllc->daddr.mac, IFHWADDRLEN); - newllc->addr.sllc_dsap = newllc->daddr.lsap; newllc->link = llc_ui_next_link_no(newllc->laddr.lsap); /* put original socket back into a clean listen state. */ @@ -662,7 +645,7 @@ sk->sk_ack_backlog--; skb->sk = NULL; dprintk("%s: ok success on %02X, client on %02X\n", __FUNCTION__, - llc_sk(sk)->addr.sllc_ssap, newllc->addr.sllc_dsap); + llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); frees: kfree_skb(skb); out: @@ -765,10 +748,8 @@ goto release; } if (!llc->dev) { - rtnl_lock(); - dev = dev_getbyhwaddr(addr->sllc_arphrd, addr->sllc_smac); - rtnl_unlock(); - rc = -ENETUNREACH; + rc = -ENODEV; + dev = dev_getfirstbyhwtype(addr->sllc_arphrd); if (!dev) goto release; } else @@ -791,18 +772,18 @@ if (rc) goto out; if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { - llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_dmac, - addr->sllc_dsap); + llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, + addr->sllc_sap); goto out; } if (addr->sllc_test) { - llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_dmac, - addr->sllc_dsap); + llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, + addr->sllc_sap); goto out; } if (addr->sllc_xid) { - llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_dmac, - addr->sllc_dsap); + llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, + addr->sllc_sap); goto out; } rc = -ENOPROTOOPT; @@ -850,17 +831,17 @@ goto out; if(llc->dev) sllc.sllc_arphrd = llc->dev->type; - sllc.sllc_dsap = llc->daddr.lsap; - memcpy(&sllc.sllc_dmac, &llc->daddr.mac, IFHWADDRLEN); + sllc.sllc_sap = llc->daddr.lsap; + memcpy(&sllc.sllc_mac, &llc->daddr.mac, IFHWADDRLEN); } else { rc = -EINVAL; if (!llc->sap) goto out; - sllc.sllc_ssap = llc->sap->laddr.lsap; + sllc.sllc_sap = llc->sap->laddr.lsap; if (llc->dev) { sllc.sllc_arphrd = llc->dev->type; - memcpy(&sllc.sllc_smac, &llc->dev->dev_addr, + memcpy(&sllc.sllc_mac, &llc->dev->dev_addr, IFHWADDRLEN); } } diff -Nru a/net/llc/llc_conn.c b/net/llc/llc_conn.c --- a/net/llc/llc_conn.c Fri Oct 31 14:10:53 2003 +++ b/net/llc/llc_conn.c Fri Oct 31 14:10:53 2003 @@ -514,7 +514,8 @@ if (rc->sk_type == SOCK_STREAM && rc->sk_state == TCP_LISTEN && llc->laddr.lsap == laddr->lsap && - 
llc_mac_match(llc->laddr.mac, laddr->mac)) { + (llc_mac_match(llc->laddr.mac, laddr->mac) || + llc_mac_null(llc->laddr.mac))) { sock_hold(rc); goto found; } diff -Nru a/net/llc/llc_input.c b/net/llc/llc_input.c --- a/net/llc/llc_input.c Fri Oct 31 14:10:54 2003 +++ b/net/llc/llc_input.c Fri Oct 31 14:10:54 2003 @@ -40,13 +40,13 @@ struct sk_buff *skb)) { if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) - llc_type_handlers[type] = handler; + llc_type_handlers[type - 1] = handler; } void llc_remove_pack(int type) { if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) - llc_type_handlers[type] = NULL; + llc_type_handlers[type - 1] = NULL; } void llc_set_station_handler(void (*handler)(struct sk_buff *skb)) diff -Nru a/net/llc/llc_proc.c b/net/llc/llc_proc.c --- a/net/llc/llc_proc.c Fri Oct 31 14:10:53 2003 +++ b/net/llc/llc_proc.c Fri Oct 31 14:10:53 2003 @@ -44,15 +44,12 @@ read_lock_bh(&sap->sk_list.lock); sk_for_each(sk, node, &sap->sk_list.list) { if (!pos) - break; + goto found; --pos; } read_unlock_bh(&sap->sk_list.lock); - if (!pos) { - if (node) - goto found; + if (!pos) break; - } } sk = NULL; found: @@ -105,7 +102,7 @@ static void llc_seq_stop(struct seq_file *seq, void *v) { - if (v) { + if (v && v != SEQ_START_TOKEN) { struct sock *sk = v; struct llc_opt *llc = llc_sk(sk); struct llc_sap *sap = llc->sap; @@ -128,18 +125,16 @@ sk = v; llc = llc_sk(sk); - seq_printf(seq, "%2X %2X ", sk->sk_type, - !llc_mac_null(llc->addr.sllc_mmac)); + /* FIXME: check if the address is multicast */ + seq_printf(seq, "%2X %2X ", sk->sk_type, 0); - if (llc->dev && llc_mac_null(llc->addr.sllc_mmac)) + if (llc->dev) llc_ui_format_mac(seq, llc->dev->dev_addr); - else if (!llc_mac_null(llc->addr.sllc_mmac)) - llc_ui_format_mac(seq, llc->addr.sllc_mmac); else seq_printf(seq, "00:00:00:00:00:00"); seq_printf(seq, "@%02X ", llc->sap->laddr.lsap); - llc_ui_format_mac(seq, llc->addr.sllc_dmac); - seq_printf(seq, "@%02X %8d %8d %2d %3d %4d\n", llc->addr.sllc_dsap, + llc_ui_format_mac(seq, llc->daddr.mac); + seq_printf(seq, "@%02X %8d %8d %2d %3d %4d\n", llc->daddr.lsap, atomic_read(&sk->sk_wmem_alloc), atomic_read(&sk->sk_rmem_alloc), sk->sk_state, diff -Nru a/net/llc/llc_sap.c b/net/llc/llc_sap.c --- a/net/llc/llc_sap.c Fri Oct 31 14:10:54 2003 +++ b/net/llc/llc_sap.c Fri Oct 31 14:10:54 2003 @@ -54,10 +54,8 @@ addr->sllc_test = prim == LLC_TEST_PRIM; addr->sllc_xid = prim == LLC_XID_PRIM; addr->sllc_ua = prim == LLC_DATAUNIT_PRIM; - llc_pdu_decode_sa(skb, addr->sllc_smac); - llc_pdu_decode_da(skb, addr->sllc_dmac); - llc_pdu_decode_dsap(skb, &addr->sllc_dsap); - llc_pdu_decode_ssap(skb, &addr->sllc_ssap); + llc_pdu_decode_sa(skb, addr->sllc_mac); + llc_pdu_decode_ssap(skb, &addr->sllc_sap); } /** diff -Nru a/net/socket.c b/net/socket.c --- a/net/socket.c Fri Oct 31 14:10:53 2003 +++ b/net/socket.c Fri Oct 31 14:10:53 2003 @@ -1206,14 +1206,16 @@ * ready for listening. 
*/ +int sysctl_somaxconn = SOMAXCONN; + asmlinkage long sys_listen(int fd, int backlog) { struct socket *sock; int err; if ((sock = sockfd_lookup(fd, &err)) != NULL) { - if ((unsigned) backlog > SOMAXCONN) - backlog = SOMAXCONN; + if ((unsigned) backlog > sysctl_somaxconn) + backlog = sysctl_somaxconn; err = security_socket_listen(sock, backlog); if (err) { diff -Nru a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c --- a/net/xfrm/xfrm_policy.c Fri Oct 31 14:10:54 2003 +++ b/net/xfrm/xfrm_policy.c Fri Oct 31 14:10:54 2003 @@ -519,7 +519,6 @@ *polp != NULL; polp = &(*polp)->next) { if (*polp == pol) { *polp = pol->next; - atomic_dec(&pol->refcnt); return pol; } } @@ -574,6 +573,7 @@ write_lock_bh(&xfrm_policy_lock); __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir); write_unlock_bh(&xfrm_policy_lock); + xfrm_pol_put(newp); } return newp; } @@ -853,6 +853,8 @@ xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x, unsigned short family) { + if (xfrm_state_kern(x)) + return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, family); return x->id.proto == tmpl->id.proto && (x->id.spi == tmpl->id.spi || !tmpl->id.spi) && (x->props.reqid == tmpl->reqid || !tmpl->reqid) && @@ -862,14 +864,23 @@ } static inline int -xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int idx, +xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start, unsigned short family) { + int idx = start; + + if (tmpl->optional) { + if (!tmpl->mode) + return start; + } else + start = -1; for (; idx < sp->len; idx++) { if (xfrm_state_ok(tmpl, sp->x[idx].xvec, family)) return ++idx; + if (sp->x[idx].xvec->props.mode) + break; } - return -1; + return start; } static int @@ -922,32 +933,35 @@ xfrm_policy_lookup); if (!pol) - return 1; + return !skb->sp; pol->curlft.use_time = (unsigned long)xtime.tv_sec; if (pol->action == XFRM_POLICY_ALLOW) { - if (pol->xfrm_nr != 0) { - struct sec_path *sp; - static struct sec_path dummy; - int i, k; - - if ((sp = skb->sp) == NULL) - sp = &dummy; - - /* For each tmpl search corresponding xfrm. - * Order is _important_. Later we will implement - * some barriers, but at the moment barriers - * are implied between each two transformations. - */ - for (i = pol->xfrm_nr-1, k = 0; i >= 0; i--) { - if (pol->xfrm_vec[i].optional) - continue; - k = xfrm_policy_ok(pol->xfrm_vec+i, sp, k, family); - if (k < 0) - goto reject; - } + struct sec_path *sp; + static struct sec_path dummy; + int i, k; + + if ((sp = skb->sp) == NULL) + sp = &dummy; + + /* For each tunnel xfrm, find the first matching tmpl. + * For each tmpl before that, find corresponding xfrm. + * Order is _important_. Later we will implement + * some barriers, but at the moment barriers + * are implied between each two transformations. 
+ */ + for (i = pol->xfrm_nr-1, k = 0; i >= 0; i--) { + k = xfrm_policy_ok(pol->xfrm_vec+i, sp, k, family); + if (k < 0) + goto reject; + } + + for (; k < sp->len; k++) { + if (sp->x[k].xvec->props.mode) + goto reject; } + xfrm_pol_put(pol); return 1; } diff -Nru a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c --- a/net/xfrm/xfrm_state.c Fri Oct 31 14:10:54 2003 +++ b/net/xfrm/xfrm_state.c Fri Oct 31 14:10:54 2003 @@ -831,6 +831,7 @@ if (err >= 0) { xfrm_sk_policy_insert(sk, err, pol); + xfrm_pol_put(pol); err = 0; } diff -Nru a/patches/applied-patches b/patches/applied-patches --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/applied-patches Fri Oct 31 14:10:54 2003 @@ -0,0 +1,28 @@ +all-sources.diff +i386-sys_reiser4.diff +do_mmap2-fix.diff +uml-summa.diff +reiser4-fs-Makefile.diff +fsync_super.diff +reiser4-fs-Kconfig.diff +sb_sync_inodes.diff +export-generic_forget_inode.diff +spinlock-owner.diff +truncate_mapping_pages_range.diff +page-owner.diff +init_fixmap_vma.diff +export-remove_from_page_cache.diff +export-page_cache_readahead.diff +fs_activation.diff +static-inline-quotaops.diff +uml-asm-cpufeature-h.diff +uml-asm-local-h.diff +uml-kill-irq_kern.h.diff +uml-export-in-ksyms.c.diff +uml-sched_clock.diff +uml-AUTOCONF_INCLUDED.diff +uml-tty-init.diff +uml-kill-cow.diff +uml-asm-sections.diff +uml-asm-module-i386.h.diff +reget-page-mapping.diff diff -Nru a/patches/bin/added-by-patch b/patches/bin/added-by-patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/added-by-patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,14 @@ +#!/bin/sh +# Extract names of new files from a patch, print them out + +PATCHFILE=$1 +case "$PATCHFILE" in +*.gz) CMD="gzip -d < $PATCHFILE";; +*) CMD="cat $PATCHFILE";; +esac + +TMP=$(mktemp /tmp/abp.XXXXXX) + +eval $CMD | egrep '^--- .*1969|^--- .*1970' > $TMP +sed -e 's@[^/]*/\([^ ]*\).*@\1@' < $TMP | sed -e 's@^linux/@@' | sort +rm -f $TMP diff -Nru a/patches/bin/apatch b/patches/bin/apatch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/apatch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,101 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +do_apply() +{ + FILES=$(cat $P/pc/$PATCH_NAME.pc) + for file in $FILES + do + copy_file_to_bup $file $PATCH_NAME + done + + silent=-s + if [ $opt_force != 0 ] + then + silent= + fi + + if patch -p1 $silent -i "$1" || [ $opt_force != 0 ] + then + true + else + echo SOMETHING WENT WRONG + exit 1 + fi +} + +add_to_db() +{ + basename "$1" >> "$DB" +} + +usage() +{ + echo "Usage: apatch patchname" + exit 1 +} + +opt_force=0 +PATCH_NAMES="" + +for i in $* +do + case "$i" in + -f) + opt_force=1 + shift;; + *) + PATCH_NAMES="$PATCH_NAMES $i" + esac +done + +if [ x"$PATCH_NAMES" == x ] +then + usage +fi + +apatch() +{ + PATCH_NAME=$(stripit $1) + + need_file_there $P/pc/$PATCH_NAME.pc + + if is_applied "$PATCH_NAME" + then + echo "$PATCH_NAME" is already applied + exit 1 + fi + + if [ $opt_force != 0 ] + then + echo FORCING PATCH + fi + + if [ $opt_force != 0 ] || can_apply $P/patches/"$PATCH_NAME".patch + then + do_apply $P/patches/"$PATCH_NAME".patch + add_to_db "$PATCH_NAME" + echo applied $PATCH_NAME + echo + else + echo "$PATCH_NAME" does not apply + exit 1 + fi +} + +for i in $PATCH_NAMES +do + if ! 
apatch $i + then + exit 1 + fi +done + diff -Nru a/patches/bin/cat-series b/patches/bin/cat-series --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/cat-series Fri Oct 31 14:10:54 2003 @@ -0,0 +1,17 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +if [ $# -eq 0 ] +then + cat_series +else + __cat_series $1 +fi diff -Nru a/patches/bin/combine-applied b/patches/bin/combine-applied --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/combine-applied Fri Oct 31 14:10:54 2003 @@ -0,0 +1,46 @@ +#!/bin/sh + +# +# Make superpatch from currently applied patches using combinediff. +# + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: combine-applied output-file" + exit 1 +} + +if [ $# -ne 1 ] +then + usage +fi + +need_file_there applied-patches +CURRENT=$(mktemp /tmp/cmbd-XXXXXXXX) +for FILE in `cat applied-patches` +do + NEXT=$(mktemp /tmp/cmbd-XXXXXXXX) + if [ -f $P/patches/$FILE ] + then + combinediff $CURRENT $P/patches/$FILE > $NEXT + elif [ -f $P/patches/$FILE.patch ] + then + combinediff $CURRENT $P/patches/$FILE.patch > $NEXT + elif [ -f $FILE ] + then + combinediff $CURRENT $FILE > $NEXT + fi + rm $CURRENT + CURRENT=$NEXT +done + +mv $NEXT "$1" diff -Nru a/patches/bin/combine-series b/patches/bin/combine-series --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/combine-series Fri Oct 31 14:10:55 2003 @@ -0,0 +1,46 @@ +#!/bin/sh + +# +# Make superpatch from current series using combinediff. +# + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: combine-series output-file" + exit 1 +} + +if [ $# -ne 1 ] +then + usage +fi + +need_file_there series +CURRENT=$(mktemp /tmp/cmbd-XXXXXXXX) +for FILE in $(cat_series) +do + NEXT=$(mktemp /tmp/cmbd-XXXXXXXX) + if [ -f $P/patches/$FILE ] + then + combinediff $CURRENT $P/patches/$FILE > $NEXT + elif [ -f $P/patches/$FILE.patch ] + then + combinediff $CURRENT $P/patches/$FILE.patch > $NEXT + elif [ -f $FILE ] + then + combinediff $CURRENT $FILE > $NEXT + fi + rm $CURRENT + CURRENT=$NEXT +done + +mv $NEXT "$1" diff -Nru a/patches/bin/cvs-take-patch b/patches/bin/cvs-take-patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/cvs-take-patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,78 @@ +#!/bin/sh + +doit() +{ + echo $* + $* +} + +usage() +{ + echo "Usage: cvs-take-patch patch_file_name" + exit 1 +} + +# +# Find the highest level directory in $1 which does not +# contain the directory $2. Return it in $MISSING +# +highest_missing() +{ + START_DIR="$1" + NAME="$2" + MISSING="" + WHERE=$(dirname "$START_DIR") + PREV_WHERE=$START_DIR + while [ x"$WHERE" != x"$PREV_WHERE" ] + do + WHERE="$PREV_WHERE" + if [ ! 
-d "$WHERE"/"$NAME" ] + then + MISSING="$WHERE" + fi + PREV_WHERE=$(dirname "$WHERE") + done + echo highest_missing returns $MISSING +} + +# +# Add all new directries to CVS, top-down +# $1: name of a directory +# $2: name of the CVS directory +# +add_cvs_dirs() +{ + MISSING=foo + while [ "$MISSING" != "" ] + do + highest_missing $1 $2 + if [ x"$MISSING" != "x" ] + then + if [ ! -d "$MISSING"/"$2" ] + then + doit cvs add $MISSING + fi + fi + done +} + +PATCHFILE=$1 + +REMOVEDFILES=$(removed-by-patch $PATCHFILE) +if [ "$REMOVEDFILES" != "" ] +then + doit cvs remove $REMOVEDFILES +fi + +NEWFILES=$(added-by-patch $PATCHFILE) +for i in $NEWFILES +do + DIRNAME=$(dirname $i) + echo "Looking at $DIRNAME" + add_cvs_dirs $DIRNAME CVS +done + +if [ "$NEWFILES" != "" ] +then + doit cvs add $NEWFILES +fi diff -Nru a/patches/bin/docco.txt b/patches/bin/docco.txt --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/docco.txt Fri Oct 31 14:10:55 2003 @@ -0,0 +1,717 @@ +Patch management scripts +Andrew Morton <akpm@digeo.com> +18 October 2002 + +This is a description of a bunch of shell scripts which I use for +managing kernel patches. They are quite powerful. They can be used on +projects other than the linux kernel. They are easy to use, and fast. + +You end up doing a ton of recompiling with these scripts, because +you're pushing and popping all the time. ccache takes away the pain of +all that. http://ccache.samba.org/ - be sure to put the cache +directory on the same fs as where you're working so that ccache can use +hardlinks. + +The key philosophical concept is that your primary output is patches. +Not ".c" files, not ".h" files. But patches. So patches are the +first-class object here. + +Installation +============ + +You place all the scripts somewhere in your path, or in +/usr/lib/patch-scripts. + +Terminology +=========== + +The patch scripts require three special directories called "pc", +"patches" and "txt". + +If the environment variable PATCHSCRIPTS is set, it is taken to to be +the directory in which those three directories reside. Typically, it +would be a relative pathname. So + + setenv PATCHSCRIPTS ./i-put-them-here + +would tell the patch scripts to look in ./i-put-them-here/pc, etc. + +If PATCHSCRIPTS is not set, and the directory ./patch-scripts is +present then the patch scripts will us ./patch-scripts/pc/, +./patch-scripts/patches/ and ./patch-scripts/txt/. + +Otherwise, the patch scripts use ./pc, ./patches and ./txt. + +In this document, the symbol $P is used to describe the directory which +holds the pc/, patches/ and txt/ directories, as determined by the +above search. + +It is expected that $P will always expand to a relative path. + +Concepts +======== + +All work occurs with a single directory tree. All commands are invoked +within the root of that tree. The scripts manage a "stack" of patches. + +Each patch is a changeset against the base tree plus the preceding patches. + +All patches are listed, in order, in the file ./series. You manage the +series file. Lines in the series file which start with `#' are ignored. + +Any currently-applied patches are described in the file +./applied-patches. The patch scripts manage this file. + +Each patch affects a number of files in the tree. These files are +listed in a "patch control" file. 
These .pc files live in the +directory $P/pc/ + +Patches are placed in the directory $P/patches/ + +Documentation for the patches is placed in $P/txt/ + +So for a particular patch "my-first-patch" the following will exist: + +- An entry "my-first-patch.patch" in ./series + +- An entry "my-first-patch" in ./applied-patches (if it's currently applied) + +- A file $P/pc/my-first-patch.pc which contains the names of the + files which my-first-patch modifies, adds or removes + +- A file $P/txt/my-first-patch.txt which contains the patch's + changelog. + +- A file $P/patches/my-first-patch.patch, which is the output of the + patch scripts. + +Operation +========= + +When a patch "my-patch" is applied with apatch, or with pushpatch +(which calls apatch), all the affected files (from $P/pc/my-patch.pc) +are copied to files with ~my-patch appended. So if $P/pc/my-patch.pc +contained + + kernel/sched.c + fs/inode.c + +then apatch will copy those files into kernel/sched.c~my-patch and +fs/inode.c~my-patch. It will then apply the patch to kernel/sched.c +and fs/inode.c + +When a diff is regenerated by refpatch (which calls mpatch), the diff +is made between kernel/sched.c and kernel/sched.c~my-patch. How do the +scripts know to use "~my-patch"? Because my-patch is the current +topmost patch. It's the last line in ./applied-patches. + +In this way, the whole thing is stackable. If you have four patches +applied, say "patch-1", "patch-2", "patch-3" and "patch-4", and if +patch-2 and patch-4 both touch kernel/sched.c then you will have: + + kernel/sched.c~patch-2 Original copy, before patch-2 + kernel/sched.c~patch-4 Copy before patch-4. Contains changes + from patch-2 + kernel/sched.c Current working copy. Contains changes + from patch-4. + +This means that your diff headers contain "~patch-name" in them, which +is convenient documentation. + +Walkthrough +=========== + +Let's start. + +Go into /usr/src/linux (or wherever) + + mkdir pc patches txt + +Now let's generate a patch + + fpatch my-patch kernel/sched.c + +OK, we've copied kernel/sched.c to kernel/sched.c~my-patch. We've +appended "my-patch" to ./applied-patches and we've put "kernel/sched.c" +into the patch control file, pc/my-patch.pc. + + Now edit kernel/sched.c a bit. + +Now we're ready to document the patch + + Now write txt/my-patch.txt + +Now generate the patch + + refpatch + +This will generate patches/my-patch.patch. Take a look. + +Now remove the patch + + poppatch + +applied-patches is now empty, and the patch is removed. + +Now let's add a file to my-patch and then generate my-second-patch: + + Add "my-patch.patch" to ./series (no blank lines in that file please) + + pushpatch + +OK, the patch is applied again. Let's add another file + + fpatch kernel/printk.c + +Note that here we gave fpatch a single argument. So rather than +opening a new patch, it adds kernel/printk.c to the existing topmost +patch. That's my-patch. + + Edit kernel/printk.c + +Refresh my-patch (you end up running refpatch a lot) + + refpatch + +Now start a second patch: + + fpatch my-second-patch kernel/sched.c + +Now take a look at applied-patches. Also do an `ls kernel/sched*'. + + Edit kernel/sched.c, to make some changes for my-second-patch + +Generate my-second-patch: + + refpatch + +Take a look in patches/my-second-patch.patch + +Don't forget to add "my-second-patch.patch" to the series file. + +And remove both patches: + + poppatch + poppatch + + +That's pretty much it, really. 
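For quick reference, the whole walkthrough above condenses to one edit cycle; a sketch using the same patch and file names as the walkthrough:

    mkdir pc patches txt             # one-time setup
    fpatch my-patch kernel/sched.c   # open the patch, back up the file
    <edit kernel/sched.c>
    <write txt/my-patch.txt>
    refpatch                         # emit patches/my-patch.patch
    poppatch                         # unapply; the tree is clean again
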
+ + +Command reference +================= + +Generally, where any of these commands take a "patch-name", that can be +of the form txt/patch-name.txt, patch-name.pc, just patch-name or +whatever. The scripts will strip off a leading "txt/", "patches/" or +"pc/" and any trailing extension. This is so you can do + + apatch patches/a<tab> + +to conveniently use shell tabbing to select patch names. + + + +added-by-patch + + Some internal thing. + +apatch [-f] patch-name + + This is the low-level function which adds patches. It does the + copying into ~-files and updates the applied-patches file. It + applies the actual patch. + + apatch will do a patch --dry-run first and will refuse to apply the + patch if the dryrun fails. + + So when you are getting rejects you do this: + + pushpatch # This fails, due to rejects. Drat. + apatch -f patch-name # Force the patch + (or) pushpatch -f # Force the patch + + OK, you've now applied patch-name, but you have rejects. Go fix + those up and do + + refpatch + + And you're ready to move on. + +combine-series output-file + + It incrementally combinediffs all the patches in series to make a + complete patch for the series. Requires combinediff from patchutils. + + See http://cyberelk.net/tim/patchutils/ (Don't download the + "experimental" patchutils - it seems to only have half of the + commands in it. Go for "stable") + +cvs-take-patch + + I forget. + +export_patch + + export the patches listed in ./series to a set of files which + are named in such a way that the sort order is the same as the + order of the series file. + + Usage: export_patch directory [prefix] + + Example: + + Suppose ./series contains + + mango.patch + orange.patch + banana.patch + apple.patch + pear.patch + + export_patch ../mypatches fruit + + The patches would be copied to + + ../mypatches/p00001_fruit_mango.patch + ../mypatches/p00002_fruit_orange.patch + ../mypatches/p00003_fruit_banana.patch + ../mypatches/p00004_fruit_apple.patch + ../mypatches/p00005_fruit_pear.patch + + Named in this way, someone may easily apply them: + + cat mypatches/p*fruit* | patch -p1 + + If prefix is omitted, the patchnames will be transformed + such that "original.patch" becomes "pXXXXX_original.patch". + +fpatch [patch-name] foo.c + + If patch-name is given, fpatch will start a new patch which + modifies (or adds, or removes) the single file foo.c. It updates + ./applied-patches and creates pc/patch-name.pc. fpatch will copy + foo.c to foo.c~patch-name in preparation for edits of foo.c. + + If patch-name is not given then fpatch will add foo.c to the + current topmost patch. It will add "foo.c" to $P/pc/$(toppatch).pc. + It will copy foo.c to foo.c~$(toppatch). + +import_patch + + Imports a set of patch files, creating $P/pc, $P/txt, $P/patches and + ./series as necessary. It also creates $P/txt/*.txt by stripping + off the top of the patches (and removes any diffstat output it finds, + so that it can eat refpatch output and export_patch output.) The + imported patch names are appended to the series file. + + In creating the $P/txt/*.txt files, mail headers are stripped with + formail, preserving the "From:" and "Subject:" lines. "DESC" and + "EDESC" markers are added if they are not already present, using the + "From:" and "Subject:" lines for the DESC portion, if they are present. + (See "patchdesc" command, below, for more on these markers.) + + Also, it can rename the patch file as it is imported by stripping out + a pattern. 
This is useful if, as often is the case, you have patch + sets with filenames designed to help sort the patches into the correct + order, such as "p001_xxx_funky_stuff.patch": you can have it automatically + renamed to funky_stuff.patch on import, and let the series file manage + the ordering. + + Import_patch will uncompress patches (*.Z, *.bz2, *.gz) as necessary. + + Usage: + + import_patch [-p pattern] patchfile ... + + Example: + + % ls ../fruit/p*patch + ../fruit/p00001_northern_apple.patch + ../fruit/p00001_tropical_mango.patch + ../fruit/p00002_northern_pear.patch + ../fruit/p00002_tropical_orange.patch + ../fruit/p00003_tropical_banana.patch + % import_patch -p 'p[0-9]*_tropical_' ../fruit/p*tropical* + Recreated pc/mango.pc + Recreated pc/orange.pc + Recreated pc/banana.pc + % import_patch -p 'p[0-9]*_northern_' ../fruit/p*northern* + Recreated pc/apple.pc + Recreated pc/pear.pc + + Then you can "pushpatch; refpatch" 5 times. + +inpatch + + List the names of the files which are affected by the current + topmost patch. + + This is basically + + cat pc/$(toppatch).pc + +join-patch patchname + + "joins" the named patch to the current topmost patch. + + Use this when you want to merge two patches into one. All the + files which `patchname' affects are added to pc/$(toppatch).pc (if + they are not already there) and patch `patchname' is applied. The + top patch remains unchanged. You'll need to run refpatch afterwards. + +mpatch + + A low-level thing to generate patches + +new-kernel + + Some thing I use for importing a new kernel from kernel.org + +p0-2-p1 + + Internal thing to convert patch -p0 form into patch -p1 + +patchdesc + + Generates a single-line description of a patch. + + The txt/my-patch.txt files have the following format: + + <start of file> + DESC + some short description + EDESC + + The long description + <end of file> + + I use + + patchdesc $(cat series) + + to generate short-form summaries of the patch series. + +patchfns + + Internal utilities + +pcpatch + + Standalone tool to generate a .pc file from a patch. + + Say someone sends you "his-patch.diff". What you do is: + + cp ~/his-patch.diff patches/his-patch.patch + pcpatch his-patch + + This generates $P/pc/his-patch.pc and you're all set. Add + "his-patch.patch" to ./series in the right place and start pushing. + +p_diff + + I forget + +poppatch + + Remove one or more patches from the current stack. This command + does *not* use the series file. It works purely against + applied-patches. + + Usage: + + poppatch + Remove the topmost patch + poppatch 10 + Remove ten patches + poppatch some-patch-name[.patch] + Remove patches until "some-patch-name" is top patch + +pstatus + + Shows status of patches + + Usage: + pstatus [patchfile ...] + + One line per patch is output showing: + 1: Patch number in the series file + 2: Whether the patch is currently applied + 3: Name of patch + 4: Status of the patch (needs pcpatch, changelog, refpatch) + + If no patchfiles are specified, $P/patches/*.patch + are assumed. + + Caveats: + A patch set which contains separate patches to add a file + and modify that same file may give spurious "Needs refpatch" + status for the patch which adds the file or the topmost patch. + +ptkdiff + + Two modes: + + ptkdiff - + + Run tkdiff against all the files affected + by $(toppatch). The diff is only for the changes made + by the top patch! ie: it's between "filename" and + "filename~toppatch-name". 
+ + ptkdiff filename + + Just run tkdiff against that file, + showing the changes which are due to toppatch. + +pushpatch [-f] + + Apply the next patch from the series file. + + This consults ./applied-patches to find out the top patch, then + consults ./series to find the next patch. And pushes it. + + pushpatch + + Apply the next patch + + pushpatch 10 + + Apply the next ten patches + + pushpatch some-patch-name + + Keep pushing patches until "some-patch-name" is toppatch + + pushpatch -f + + Push the next patch, ignoring rejects. + +refpatch + + Regenerates the topmost patch. Reads all the affected files + from pc/$(toppatch).pc and diffs them against their tilde-files. + + Also pastes into the patch your patch documentation and + generates a diffstat summary. + +removed-by-patch + + Something. + +rename-patch + + CVS rename for patches. + +rolled-up-patch + + Bit of a hack. It's designed to generate a rolled-up diff of all + currently-applied patches. But it requires a ../linux-2.x.y tree to + diff against. Needs to be redone. + +rpatch + + Internal command. + +split-patch + + Something someone wrote to split patches up. I don't use it. + +tag-series + + Assuming you keep pc/*, patches/* and txt/* under CVS revision + control, tag-series allows you to tag a patchset's individual + components. I use + + tag-series s2_5_44-mm3 pc/2.5.44-mm3-series + + which will attach the cvs tag "s2_5_44-mm3" to every .pc, .patch + and .txt file which is mentioned in the series file + "pc/2.5.44-mm3-series". + + It will also tag pc/2.5.44-mm3-series, which is a bit redundant + given that I use a different series file for each patchset release. + + +toppatch + + Print the name of the topmost patch. From ./applied-patches. + +touched-by-patch patch-filename + + List the names of files which are affected by a diff. + +unitdiff.py + + Rasmus Andersen's script to convert a diff into minimum-context + form. This form has a better chance of applying if you're getting + nasty rejects. But patch can and will make mistakes when fed + small-context input. + + +Work Practices +============== + +I keep the kernel tree, the $P/pc/, $P/patches/ and $P/txt/ contents under +CVS control. This is important... + +I have several "series" files. I keep these in $P/pc/foo-series and use + + ln -s pc/foo-series series + +when I'm working on foo. (There's a worked example of switching between +series at the end of this section.) + +If someone sends me a patch I'll do: + + cp ~/whatever patches/his-patch.patch + pcpatch his-patch + apatch his-patch + + If apatch fails then run `apatch -f his-patch' and fix the rejects. + + refpatch + + to clean up any fuzz. + + poppatch + cvs add pc/his-patch.pc patches/his-patch.patch + cvs commit pc patches + + Now edit ./series and place "his-patch.patch" in the appropriate place. + + +If you're working on a particular patch (say, "dud-patch") and you +balls something up, just run: + + refpatch # Generate the crap patch + poppatch # Remove it all + rm patches/dud-patch.patch + cvs up patches/dud-patch.patch + +and all is well.
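+ +To round off the several-series-files setup described above, here is a +worked example (the series names here are hypothetical) of switching from +the "foo" series to a "bar" series: + + poppatch 999 # pop every applied patch + rm series + ln -s pc/bar-series series + pushpatch 999 # push the whole "bar" series + +and you are now working against "bar".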
+ + +Getting updates from Linus +========================== + +What I do is grab the latest -bk diff from +http://www.kernel.org/pub/linux/kernel/people/dwmw2/bk-2.5/ +and do: + + gzip -d < cs<tab> > patches/linus.patch + pcpatch linus + apatch linus | grep diff + + Now fix up all the files which got deleted, + because there's something wrong with bitkeeper diffs: + + cvs up -ko <missing files from the above diff> + + apatch linus + $EDITOR txt/linus.txt + + Add the changeset number to txt/linus.txt. + + refpatch + poppatch + + Now add "linus.patch" as the first entry in your ./series file and + start pushing your other patches on top of that. + +BUGS +==== + +Tons and tons. The scripts are fragile, the error handling is ungraceful and +if you do something silly you can end up in a pickle. + +Generally the scripts are very careful to not wreck your files or your +patches. But they can get the ./applied-patches and ~-files into an +awkward state. + +Usually you can sort it out by copying the ~-files back onto the originals +and removing the last line from ./applied-patches. Or do a "refpatch ; +poppatch ; rm patches/troublesome-patch.patch ; cvs up patches". + +If it's really bad, just blow away the entire tree and do a new CVS checkout. + + +Working on non-kernel projects +============================== + +Well it's the same thing. Say you've downloaded a copy of util-linux +and you want to make a change: + + cd /usr/src + tar xvfz ~/util-linux.tar.gz + cd util-linux + mkdir pc patches txt + fpatch my-patch sys-utils/rdev.c + fpatch sys-utils/ipcs.8 + <edit, edit> + refpatch + <ship patches/my-patch.patch> + +How to balls things up +====================== + +Well here's one way. Suppose you have 20 patches applied, and three of +them (say, "p1", "p6" and "p11") all modify "foo.c". + +Now you go and change foo.c. + +Well, to which patch does that change belong? You need to decide. +Let's say you decide "p6". + +If you run `refpatch' when "p11" is toppatch then you lose. The diff +went into p11. + +What you can do is: + +1: + poppatch p6 + <edit> + refpatch + pushpatch p11 + <test> + + (See why ccache is looking good?) + +or + +2: + <edit> + <test> + poppatch p6 <hope like hell that the other patches remove cleanly> + refpatch + + +Another good way of ballsing up is to cheat. Say "oh I just want to make +this one-line change". And "oh, and this one". + +Now you're getting in a mess. It's much, much better to just use the system: + + fpatch junk file1 + fpatch file2 + <edit> + <play> + refpatch + poppatch + rm pc/junk.pc patches/junk.patch + +Merging with -mm kernels +======================== + +Haven't tried this, but it should work: + +- Grab all the patches from broken-out/, place them in your $P/patches/ + +- Copy my series file into ./series (or $P/pc/akpm-series and symlink it) + +- pushpatch 99 + +And you're off and running. The nice thing about this is that you can +send me incremental diffs to diffs which I already have. + +Or whatever. I'm fairly handy with diffs nowadays. Rejects are +expected. I just prefer to have "one concept per diff". + diff -Nru a/patches/bin/export_patch b/patches/bin/export_patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/export_patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,58 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'."
+ echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "export_patch: export the patches listed in $P/series" 1>&2 + echo "usage: export_patch destination-directory [prefix] " 1>&2 + exit 1 +} + +DIR="$1" +PREFIX="$2""_" + +if [ "$DIR" = "" ] +then + usage +fi + +if [ -e "$DIR" -a ! -d "$DIR" ] +then + echo "$DIR exists already, but is not a directory." 1>&2 + exit 1 +fi + +if [ ! -r $P/series ] +then + echo "$P/series is not readable." 1>&2 + exit 1 +fi + +mkdir -p "$DIR" || exit 1 + +count=1 +for x in $(cat_series) +do + fname=`echo "$count" "$PREFIX" "$x" |\ + awk '{ if ( $2 != "_" ) + printf("p%05d_%s%s\n", $1, $2, $3); + else + printf("p%05d_%s\n", $1, $3); + }'` + if [ ! -r $P/patches/"$x" ] + then + echo "$P/patches/"$x" is not readable. skipping." 1>&2 + continue; + fi + cp -f $P/patches/"$x" "$DIR"/"$fname" || continue; + count=`expr $count + 1` +done + diff -Nru a/patches/bin/extract_description b/patches/bin/extract_description --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/extract_description Fri Oct 31 14:10:55 2003 @@ -0,0 +1,87 @@ +#!/bin/sh + +insert_line() +{ + PATTERN="$1" + LINE="$2" + FILE="$3" + awk ' BEGIN { found=0; } + /'"$PATTERN"'/ { + print; + if (!found) + printf("%s\n", "'$LINE'"); + found=1; + next; + } + { print; } + ' < "$FILE" +} + +# extract the description from the top of a patch +# filter stdin +# collapse adjacent blank lines to a single blank line +# remove any lines that look like diffstat output +# stop output on encountering a line beginning with '---' (beginning of patch) + + TMPFILE=`mktemp /tmp/xdtmp.XXXXXX` || exit 1 + formail -kfcb -X 'From:' -X 'Subject:' |\ + awk ' + BEGIN { found_end=0; lastone="x"; } + /^ .* [|] +[0-9]+ [+-]+$/ { + #/* we found something like diffstat output... */ + if (found_end == 1) { + /* we are past end of diffstat, let it pass */ + print; + } + next; + } + /^ [1-9][0-9]* files changed/ { + #/* end of diffstat output, stop filtering diffstat */ + found_end=1; + next; + } + /^--- / { exit; } + { + #/* collapse adjacent blank lines to 1 blank line */ + if ( $0 == "" && lastone == "" ) + next; + else + print; + lastone=$0; + } + ' | awk '{ if ($0 == "" && FNR == 1) next; print; }' > "$TMPFILE" + + descs=`head -10 $TMPFILE | grep -c '^[ ]*DESC[ ]*$'` + if [ "$descs" = "0" ] + then + # DESC is not 1st non blank line in the file + echo "DESC" + descs=0 + fi + edescs=`grep -c '^EDESC$' "$TMPFILE"` + subjects=`grep -c '^[ ]*Subject[:]' "$TMPFILE"` + froms=`grep -c '^[ ]*From[:]' "$TMPFILE"` + if [ "$edescs" = "0" ] + then + if [ "$subjects" != "0" ] + then + insert_line '^Subject[:]' 'EDESC' "$TMPFILE" + else + if [ "$froms" != "0" ] + then + insert_line '^From[:]' 'EDESC' "$TMPFILE" + else + if [ "$descs" = "0" ] + then + # blank DESC line... + echo '(undescribed patch)' + echo EDESC + cat "$TMPFILE" + else + insert_line '^DESC$' "EDESC" "$TMPFILE" + fi + fi + fi + else + cat $TMPFILE + fi diff -Nru a/patches/bin/fpatch b/patches/bin/fpatch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/fpatch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,66 @@ +#!/bin/sh + +# +# Add a file to a patch. +# + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: fpatch patchname filename filename ..." 
+ echo " fpatch filename" + exit 1 +} + + +doit() +{ + patch_name=$1 + file_name=$2 + if is_applied_last $patch_name + then + true + else + if is_applied $patch_name + then + echo $patch_name is not the last-applied patch + exit 1 + else + echo $patch_name >> $DB + fi + fi + + + if file_in_patch $file_name $patch_name + then + echo File $file_name is already in patch $patch_name + exit 1 + fi + + install_file_in_patch $file_name $patch_name +} + +if [ $# == 1 ] +then + PATCH_NAME=$(top_patch) + FILENAME=$1 + doit $PATCH_NAME $FILENAME +elif [ $# -ge 2 ] +then + PATCH_NAME=$(stripit $1) + shift + for i in $* + do + doit $PATCH_NAME $i + done +else + usage +fi diff -Nru a/patches/bin/import_patch b/patches/bin/import_patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/import_patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,105 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "usage: import_patch [ -p prefix-pattern ] patchfile [...]" 1>&2 + exit 1 +} + +XPATTERN="" +if [ "$1" = "-p" ] +then + XPATTERN="$2" + shift; + shift; +fi + +if [ "$1" = "" ] +then + usage +fi + +if [ ! -e applied-patches ] +then + touch applied-patches +fi + +mkdir -p patches || exit 1 +mkdir -p txt || exit 1 +mkdir -p pc || exit 1 + +if [ ! -e ./series ] +then + touch ./series + if [ "$?" != "0" ] + then + echo "Cannot create ./series" 1>&2 + exit 1 + fi +fi + +if [ ! -w ./series ] +then + echo "./series is not writable." 1>&2 + exit 1 +fi + +PATTERN='s/^'"$XPATTERN"'//' +for x in $* +do + if [ ! -r "$x" ] + then + echo "$x does not exist, skipping." 1>&2 + continue + fi + patchname=`basename $x .bz2` + patchname=`basename $patchname .gz` + patchname=`basename $patchname .Z` + patchname=`basename $patchname .patch` + if is_applied $patchname + then + echo $patchname is currently applied + exit 1 + fi + if [ "$XPATTERN" != "" ] + then + patchname=`echo $patchname | sed -e "$PATTERN"` + fi + pname=$P/patches/"$patchname".patch + if [ -r "$pname" ] + then + echo "$pname exists already, skipping." 1>&2 + continue + fi + case "$x" in + *.bz2) + bunzip2 < "$x" > "$pname" + ;; + *.gz) + gunzip < "$x" > "$pname" + ;; + *.Z) zcat < "$z" > "$pname" + ;; + *) + cat "$x" > "$pname" || continue + ;; + esac + echo "$patchname".patch >> series + pcpatch "$pname" + extract_description < "$pname" >$P/txt/"$patchname".txt + grep '^[(]undescribed patch[)]$' < $P/txt/"$patchname".txt > /dev/null + if [ "$?" = "0" ] + then + echo "Warning: $patchname has no description." 1>&2 + fi +done + diff -Nru a/patches/bin/inpatch b/patches/bin/inpatch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/inpatch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,30 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: inpatch" + exit 1 +} + +if [ $# != 0 ] +then + usage +fi + +if [ -e $DB ] +then + TOP_PATCH=$(top_patch) + if [ x$TOP_PATCH != x ] + then + cat $P/pc/$TOP_PATCH.pc + fi +fi diff -Nru a/patches/bin/join-patch b/patches/bin/join-patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/join-patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,37 @@ +#!/bin/sh + +. 
patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: join-patch patchname" + exit 1 +} + +if [ $# -ne 1 ] +then + usage +fi + +PATCHNAME=$(stripit $1) + +if ! can_apply $PATCHNAME +then + echo Patch $PATCHNAME does not apply + exit 1 +fi + +pcpatch $PATCHNAME +for i in $(cat $P/pc/$PATCHNAME.pc) +do + fpatch $i +done + +patch -p1 -i "$P/patches/$PATCHNAME.patch" -f diff -Nru a/patches/bin/linus-patch b/patches/bin/linus-patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/linus-patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,29 @@ +#!/bin/sh +# +# Grab a patch frmo kernel.org, install it. +# +# Usage: linus-patch http://www.kernel.org/pub/linux/kernel/people/dwmw2/bk-2.5/cset-1.786.152.7-to-1.798.txt.gz +# + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +poppatch 999 || die poppatch +wget $1 || die wget +FILE=$(basename $1) +gzip -d < $FILE > $P/patches/linus.patch +pcpatch linus || die pcpatch +( + echo DESC + echo $FILE + echo EDESC + echo + echo $FILE +) > $P/txt/linus.txt +rm $FILE diff -Nru a/patches/bin/mpatch b/patches/bin/mpatch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/mpatch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,105 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: mpatch patchname [output_dir]" + exit 1 +} + +doit() +{ + echo $* 1>&2 + $* || { + echo oops + exit 1 + } +} + +epoch() +{ +# doit touch -t 7001011000.00 $1 + doit touch -t 7001010000.00 $1 +} + +dirfor() +{ + dir=$(dirname $1) + if [ ! -d $dir ] + then + doit mkdir -p $dir + RMDIRS="$RMDIRS $dir" + fi +} + +if [ $# == 0 ] +then + usage +fi + +PATCH_NAME=$(stripit $1) +OUTPUT_DIR=$2 + +FILES=$(cat $P/pc/$PATCH_NAME.pc) +OUT=$P/patches/$PATCH_NAME.patch +TMPOUT=$(mktemp /tmp/patch-$PATCH_NAME-XXXXXX) +TXT=$P/txt/$PATCH_NAME.txt +OLDDIR=$(basename $(/bin/pwd)) +NEWDIR=$OLDDIR-$LOGNAME + +if is_applied_last $PATCH_NAME +then + true +else + echo $PATCH_NAME is not the last-applied patch + exit 1 +fi + +doit rm -f $OUT +echo "Placing patch in " $OUT + +if [ -e $TXT -a -s $TXT ] +then + echo >> $OUT + body $TXT >> $OUT + echo >> $OUT + echo >> $OUT +else + echo "**** No patch description for $PATCH_NAME ****" +fi + +rm -f $TMPOUT + +for file in $FILES +do + OLD_FILE="$file"~"$PATCH_NAME" + if [ ! -e $OLD_FILE ] + then + OLD_FILE=/dev/null + fi + NEW_FILE=$file + XDIFF_OPTS="" + if [ ! 
-e $NEW_FILE ] + then + NEW_FILE=/dev/null + XDIFF_OPTS="-L $file" + fi + + echo diff -puN $XDIFF_OPTS $DIFF_OPTS $OLD_FILE $NEW_FILE + echo diff -puN $XDIFF_OPTS $DIFF_OPTS $OLD_FILE $NEW_FILE >> $TMPOUT + diff -puN $XDIFF_OPTS $DIFF_OPTS $OLD_FILE $NEW_FILE | $PATCHSCRIPTS_LIBDIR/p0-2-p1 $OLDDIR $NEWDIR >> $TMPOUT +done +diffstat -p1 $TMPOUT >> $OUT 2>/dev/null +echo >> $OUT +cat $TMPOUT >> $OUT +echo >> $OUT +echo "_" >> $OUT +rm -f $TMPOUT diff -Nru a/patches/bin/new-kernel b/patches/bin/new-kernel --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/new-kernel Fri Oct 31 14:10:54 2003 @@ -0,0 +1,82 @@ +#!/bin/sh + +usage() +{ + echo "Usage: new-kernel linux-2.4.2-pre2 linux-2.4.3-pre3 linux-2.4.3 patch.gz cvs-dir" + exit 1 +} + +wantdir() +{ + if [ x$1 = x ] + then + usage + fi + if [ ! -d $1 ] + then + echo "directory $1 does not exist" + usage + fi +} + +wantfile() +{ + if [ x$1 = x ] + then + usage + fi + if [ ! -f $1 ] + then + echo "file $1 does not exist" + usage + fi +} + +doit() +{ + echo $* 1>&2 + $* || { + echo oops + exit 1 + } +} + + +CURRENT_KERNEL=$1 +NEXT_KERNEL=$2 +BASE_KERNEL=$3 +PATCH_FILE=$4 +CVS_DIR=$5 + +TEMP_PATCH=$(mktemp /tmp/patch-XXXXXX) +MY_DIFF="$CURRENT_KERNEL"--"$NEXT_KERNEL" + +wantdir $CURRENT_KERNEL +wantdir $BASE_KERNEL +wantdir $CVS_DIR +wantfile $PATCH_FILE + +doit rm -rf $NEXT_KERNEL +doit cp -a $BASE_KERNEL $NEXT_KERNEL +doit rm -f $TEMP_PATCH +doit gunzip < $PATCH_FILE > $TEMP_PATCH +cd $NEXT_KERNEL +doit patch -p1 --dry-run -i $TEMP_PATCH +doit patch -p1 -s -i $TEMP_PATCH +echo cd .. +cd .. + +echo diff -uNrp $CURRENT_KERNEL $NEXT_KERNEL +diff -uNrp $CURRENT_KERNEL $NEXT_KERNEL > $MY_DIFF + +echo cd $CVS_DIR +cd $CVS_DIR +doit patch -p1 --dry-run -s -i ../$MY_DIFF +doit patch -p1 -s -i ../$MY_DIFF +cvs-take-patch ../$MY_DIFF +cvs commit -m "'doing $NEXT_KERNEL'" +cvs update -ko -d -P + +TAG=$(echo $NEXT_KERNEL | sed -e 's@\.@_@g') +cvs tag -b $TAG +rm -f $TEMP_PATCH diff -Nru a/patches/bin/p0-2-p1 b/patches/bin/p0-2-p1 --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/p0-2-p1 Fri Oct 31 14:10:54 2003 @@ -0,0 +1,10 @@ +#!/bin/sh +# +# Usage: p0-2-p1 olddir newdir +# +OLDDIR=$1 +NEWDIR=$2 + +sed -e "s/^--- \([^\/].*\)/--- $OLDDIR\/\1/" | +sed -e "s/^+++ \([^\/].*\)/+++ $NEWDIR\/\1/" + diff -Nru a/patches/bin/p_diff b/patches/bin/p_diff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/p_diff Fri Oct 31 14:10:54 2003 @@ -0,0 +1,63 @@ +#!/bin/sh + +# +# Bring up a patched file in diff. We show the diffs +# in the topmost patch, unless it was specified +# + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: pdiff [patchname] filename" + echo " pdiff [patchname] -" + exit 1 +} + +if [ $# == 1 ] +then + PATCH_NAME=$(top_patch) + FILENAME=$1 +elif [ $# == 2 ] +then + PATCH_NAME=$(stripit $1) + FILENAME=$2 +else + usage +fi + +if ! 
is_applied $PATCH_NAME +then + echo $PATCH_NAME is not applied + exit 1 +fi + +doit() +{ + filename=$1 + unpatched_file=$filename"~"$PATCH_NAME + need_file_there $filename + if [ -e $unpatched_file ] + then + diff -u $unpatched_file $filename + else + echo pdiff: $filename appears to not be in $PATCH_NAME + fi +} + +if [ x"$FILENAME" = "x-" ] +then + FILENAME=$(cat $P/pc/$PATCH_NAME.pc) +fi + +for i in $FILENAME +do + doit $i +done diff -Nru a/patches/bin/patchdesc b/patches/bin/patchdesc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/patchdesc Fri Oct 31 14:10:54 2003 @@ -0,0 +1,24 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +desc1() +{ + PATCH=$(stripit $1) + TXT=$P/txt/$PATCH.txt + echo $PATCH.patch + desc < $TXT + echo +} + +for i in $* +do + desc1 $i +done diff -Nru a/patches/bin/patchfns b/patches/bin/patchfns --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/patchfns Fri Oct 31 14:10:54 2003 @@ -0,0 +1,214 @@ +# +# Work out where the user's pc/, patch/ and txt/ directories live. +# +# If the user specified PATCHSCRIPTS in environment then use that (it's +# probably a relative path) +# +# If there is a directory ./patch-scripts then use that +# +# Otherwise use "." +# + +PATCHSCRIPTS_LIBDIR=$(dirname $0) + +if [ x$PATCHSCRIPTS != x ] +then + P=$PATCHSCRIPTS +elif [ -d ./quilt ] +then + P=./quilt +elif [ -d ./patch-scripts ] +then + P=./patch-scripts +elif [ -d ./patches ] +then + P=. +fi + +DB="$P/applied-patches" + +__cat_series() +{ + grep -v '^#' $1 +} + +cat_series() +{ + __cat_series $P/series +} + +top_patch() +{ + tail -1 $DB +} + +die() +{ + echo error: $* + exit 1 +} + +is_numeric() +{ + if echo $1 | egrep '^[0-9]*$' > /dev/null + then + return 0 + fi + return 1 +} + +is_applied_last() +{ + name="$(stripit $1)" + top_patch >$DB.1 + if grep "^$name$" "$DB.1" > /dev/null 2>&1 + then + rm $DB.1 + return 0 + else + rm $DB.1 + return 1 + fi +} + +is_applied() +{ + name=$(stripit "$1") + if grep "^$name$" "$DB" > /dev/null 2>&1 + then + return 0 + else + return 1 + fi +} + +can_apply() +{ + pn=$(stripit $1) + if patch -p1 --dry-run -i "$P/patches/$pn.patch" -f + then + return 0 + else + return 1 + fi +} + +can_remove() +{ + if patch -R -p1 --dry-run -i $P/patches/"$1".patch -f + then + return 0 + else + return 1 + fi +} + +remove_from_db() +{ + tmpfile=$(mktemp /tmp/p_XXXXXX) + name="$1" + sed -e "/^$name$/d" < "$DB" > $tmpfile + mv $tmpfile "$DB" +} + +stripit() +{ + ret=$(basename $1) + ret=$(echo $ret | sed -e 's/\.patch$//') + ret=$(echo $ret | sed -e 's/\.pc$//') + ret=$(echo $ret | sed -e 's/\.txt$//') + echo $ret +} + +file_in_patch() +{ + file=$1 + patch=$2 + + if [ -e $P/pc/$patch.pc ] + then + if grep "^"$file"$" $P/pc/$patch.pc > /dev/null + then + return 0 + fi + fi + return 1 +} + +# copy_file_to_bup filename patchname +copy_file_to_bup() +{ + file=$1 + patch=$2 + bup="$file"~"$patch" + + if [ -e $bup ] + then + echo "Cannot install file $file in patch $patch: backup $bup exists" + exit 1 + fi + + if [ -e $file ] + then + cp $file "$file"~"$patch" + else + echo "file $file appears to be newly added" + fi +} + +install_file_in_patch() +{ + file=$1 + patch=$2 + + copy_file_to_bup $file $patch + echo $file >> $P/pc/$patch.pc +} + +need_file_there() +{ + if [ ! 
-e $1 ] + then + echo "File $1 does not exist" + exit 1 + fi +} + +desc() +{ + state=0 + while read x + do + if [ x"$x" = xDESC ] + then + state=1 + elif [ x"$x" = xEDESC ] + then + state=0 + elif [ $state = 1 ] + then + echo " $x" + fi + done +} + +body() +{ + file=$1 + + did_stuff=0 + while read x + do + if [ x"$x" = xEDESC ] + then + cat + did_stuff=1 + fi + done < $file + + if [ $did_stuff = 0 ] + then + cat $file + fi +} + diff -Nru a/patches/bin/pcpatch b/patches/bin/pcpatch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/pcpatch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,48 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "pcpatch: recreate the pc file from patches/{patchname}.patch" + exit 1 +} + +doit() +{ + echo $* 1>&2 + $* || { + echo oops + exit 1 + } +} + +if [ $# != 1 -o "$1" = "help" ] +then + usage +fi +PATCH=$1 +PATCH_NAME=$(stripit $PATCH) +PC=$P/pc/$PATCH_NAME.pc + +if [ ! -e $P/patches/$PATCH_NAME.patch ] +then + echo "$P/patches/$PATCH_NAME.patch does not exist" + exit 1 +fi + +if is_applied "$PATCH" +then + echo $PATCH is applied! + exit 1 +fi + +touched-by-patch $P/patches/$PATCH_NAME.patch > $PC +echo Recreated $PC diff -Nru a/patches/bin/poppatch b/patches/bin/poppatch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/poppatch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,73 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: poppatch [npatches]" + exit 1 +} + +doit() +{ + echo $* 1>&2 + $* || { + echo oops + exit 1 + } +} + +if [ $# -gt 1 ] +then + usage +fi + +NR=1 +STOP_AT="" +if [ $# -eq 1 ] +then + if is_numeric $1 + then + NR=$1 + else + NR=1000 + STOP_AT=$(stripit $1) + fi +fi + +pop_one() +{ + TOP_PATCH=$(top_patch) + if [ x$TOP_PATCH == x ] + then + echo "no patches applied" + exit 0 + else + popped_patch="$(top_patch)" + if ! $PATCHSCRIPTS_LIBDIR/rpatch $(top_patch) + then + echo still at $(top_patch) + exit 1 + fi + echo + fi +} + +for i in $(seq 1 $NR) +do + pop_one + if [ x$STOP_AT != "x" ] + then + if [ $STOP_AT == $(toppatch) ] + then + exit 0 + fi + fi +done diff -Nru a/patches/bin/prep-patch b/patches/bin/prep-patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/prep-patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,22 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +if [ $# -ne 1 ] +then + echo "Usage prep-patch patchname" + exit 1 +fi + +PATCHNAME=$(stripit $1) + +head -2 $P/txt/$PATCHNAME.txt | tail -1 | tr -d '\n' | xcb -s 1 +xcb -p 1; echo +cp $P/patches/$PATCHNAME.patch /tmp/patch diff -Nru a/patches/bin/pstatus b/patches/bin/pstatus --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/pstatus Fri Oct 31 14:10:55 2003 @@ -0,0 +1,159 @@ +#!/bin/sh + +# print out patch status. Usage: pstatus [ patchfile ... ] +# +# Stephen Cameron <steve.cameron@hp.com> +# + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. 
$PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +if [ ! -f $P/series ] +then + echo "./series does not exist." 1>&2 + exit 1 +fi + +if [ ! -d $P/patches ] +then + echo "Directory ./patches does not exist." 1>&2 + exit 1 +fi + + +PATCHLIST="$*" +if [ "$PATCHLIST" = "" ] +then + series_optimize=yes + PATCHLIST=$(cat_series | sed -e 's/[.]patch[ ]*$//') + SORTSERIES=`mktemp /tmp/ser.XXXXXX` || exit 1 + SORTPATCHES=`mktemp /tmp/pat.XXXXXX` || exit 1 + cat_series | sed -e 's/^[ ]//' -e 's/[.]patch[ ]*$//' | \ + sort > $SORTSERIES + exists="`echo $P/patches/*.patch 2>/dev/null`" + if [ "$exists" != "$P/patches/*.patch" ] + then + ls -1 $P/patches/*.patch | sed -e 's/^.*\/patches\///' \ + -e 's/[.]patch[ ]*$//' | sort > $SORTPATCHES + PATCHLIST="$PATCHLIST"" `comm -1 -3 $SORTSERIES $SORTPATCHES`" + fi + rm -f $SORTPATCHES $SORTSERIES +else + series_optimize=no +fi + +NSERIES=$(cat_series | wc -l | awk '{ print $1; }') +series=1 +for PATCH_NAME in $PATCHLIST +do + PATCH_NAME=$(stripit $PATCH_NAME) + # see if this patch even exists + if [ ! -f $P/patches/"$PATCH_NAME".patch ] + then + echo "$PATCH_NAME does not exist." + continue + fi + # see if this patch is applied + applied="-" + if [ -f $P/applied-patches ] + then + grep '^'"$PATCH_NAME"'$' $P/applied-patches > /dev/null + if [ "$?" = "0" ] + then + applied="a" + fi + fi + + # figure the status of this patch, that is, + # if it needs changelog, pcpatch, refpatch + + stat="" + if [ ! -f $P/txt/"$PATCH_NAME".txt ] + then + stat="changelog " + fi + if [ ! -f $P/pc/"$PATCH_NAME".pc ] + then + stat="$stat""pcpatch " + elif [ "$applied" != '-' ] + then + rpatch=n + + # for each file this patch touches + for y in `cat $P/pc/"$PATCH_NAME".pc` + do + # is the patch adding the file? + if [ ! -e "$y"'~'"$PATCH_NAME" -a -f "$y" ] + then + # file is newer than the patch? + if [ "$y" -nt $P/patches/"$PATCH_NAME".patch ] + then + rpatch=y + stat="$stat""refpatch " + break + fi + else + # modified file is newer than the patch? + if [ "$y"'~'"$PATCH_NAME" -nt \ + $P/patches/"$PATCH_NAME".patch ] + then + rpatch=y + stat="$stat""refpatch " + break + fi + if [ "`$PATCHSCRIPTS_LIBDIR/toppatch`" = "$PATCH_NAME" -a \ + "$y" -nt $P/patches/"$PATCH_NAME".patch ] + then + # toppatch, so check if the file + # is newer than the patch? + rpatch=y + stat="$stat""refpatch " + break + fi + fi + done + fi + # check if they changed the changelog recently + if [ "$rpatch" = "n" -a -f $P/txt/"$PATCH_NAME".txt \ + -a $P/txt/"$PATCH_NAME".txt -nt \ + $P/patches/"$PATCH_NAME".patch ] + then + rpatch=y + stat="$stat""refpatch " + fi + if [ "$stat" != "" ] + then + stat="Needs ""$stat" + fi + + if [ "$series_optimize" != "yes" ] + then + # have to find the series number the hard way. + series=$(cat_series | grep -n '^'"$PATCH_NAME"'\.patch$' |\ + awk -F: '{ printf "%d", $1}' ) + if [ "$series" = "" ] + then + series="?" + fi + fi + + echo "$series":"$applied":"$PATCH_NAME $stat" + + if [ "$series_optimize" = "yes" ] + then + if [ "$series" != "?" ] + then + series=`expr $series + 1` + if [ $series -gt $NSERIES ] + then + series="?" + fi + fi + fi +done diff -Nru a/patches/bin/ptkdiff b/patches/bin/ptkdiff --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/ptkdiff Fri Oct 31 14:10:55 2003 @@ -0,0 +1,49 @@ +#!/bin/sh + +# +# Bring up a patched file in tkdiff. We show the diffs +# in the topmost patch, unless it was specified +# + +. patchfns 2>/dev/null || +. 
/usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: ptkdiff filename ..." + echo " ptkdiff -" + exit 1 +} + +PATCH_NAME=$(top_patch) + +doit() +{ + filename=$1 + unpatched_file=$filename"~"$PATCH_NAME + need_file_there $filename + if [ -e $unpatched_file ] + then + tkdiff $unpatched_file $filename + else + echo ptkdiff: $filename appears to not be in $PATCH_NAME + fi +} + +if [ x"$1" = "x-" ] +then + FILENAME=$(cat $P/pc/$PATCH_NAME.pc) +else + FILENAME="$*" +fi + +for i in $FILENAME +do + doit $i & +done diff -Nru a/patches/bin/pushpatch b/patches/bin/pushpatch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/pushpatch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,85 @@ +#!/bin/sh + +# +# Add next patch in series +# + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: pushpatch [npatches]" + exit 1 +} + +opt_force=0 + +for i in $* +do + case "$i" in + -f) + opt_force=1;; + *) + if [ -n "$NR" -o -n "$STOP_AT" ] + then + usage + fi + if is_numeric $i + then + NR=$i + else + NR=1000 + STOP_AT=$(stripit $i) + fi;; + esac +done + +[ $opt_force = 1 ] && force="-f" + +if [ ! -e $P/series ] +then + echo 'File "series" not found' + exit 1 +fi + +push_one() +{ + top=$($PATCHSCRIPTS_LIBDIR/toppatch) + if [ x"$top" == x ] + then + todo=$(cat_series | head -1) + else + last_in_series=$(stripit $(cat_series | tail -1)) + if [ $last_in_series == $top ] + then + echo "Series fully applied. Ends at $top" + exit 0 + fi + todo=$(cat_series | grep -C1 "^$top\.patch" | tail -1) + if [ x$todo = x ] + then + todo=$(cat_series | head -1) + fi + fi + + $PATCHSCRIPTS_LIBDIR/apatch $force $todo +} + +for i in $(seq 1 $NR) +do + push_one + if [ x$STOP_AT != "x" ] + then + if [ $STOP_AT == $($PATCHSCRIPTS_LIBDIR/toppatch) ] + then + exit 0 + fi + fi +done diff -Nru a/patches/bin/refpatch b/patches/bin/refpatch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/refpatch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,34 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." 
+ echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: refpatch" + exit 1 +} + +doit() +{ + echo $* 1>&2 + $* || { + echo oops + exit 1 + } +} + +if [ $# != 0 ] +then + usage +fi + +TOP_PATCH=$(top_patch) +$PATCHSCRIPTS_LIBDIR/mpatch $* $(top_patch) +echo "Refreshed $TOP_PATCH" diff -Nru a/patches/bin/removed-by-patch b/patches/bin/removed-by-patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/removed-by-patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,14 @@ +#!/bin/sh +# Extract names of new files from a patch, print them out + +PATCHFILE=$1 +case "$PATCHFILE" in +*.gz) CMD="gzip -d < $PATCHFILE";; +*) CMD="cat $PATCHFILE";; +esac + +TMP=$(mktemp /tmp/rbp-XXXXXX) + +eval $CMD | egrep '^\+\+\+.*1970|\+\+\+.*1969' > $TMP +sed -e 's@[^/]*/\([^ ]*\).*@\1@' < $TMP | sed -e 's@^linux/@@' | sort +rm -f $TMP diff -Nru a/patches/bin/rename-patch b/patches/bin/rename-patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/rename-patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,35 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +if [ $# -eq 1 ] +then + OLD=$(stripit $1) + NEW=_$OLD +elif [ $# -eq 2 ] +then + OLD=$(stripit $1) + NEW=$(stripit $2) +else + echo "Usage: rename-patch old [new]" + exit 1 +fi + +mv $P/pc/$OLD.pc $P/pc/$NEW.pc +mv $P/patches/$OLD.patch $P/patches/$NEW.patch +mv $P/txt/$OLD.txt $P/txt/$NEW.txt + +cvs remove $P/pc/$OLD.pc +cvs remove $P/patches/$OLD.patch +cvs remove $P/txt/$OLD.txt + +cvs add $P/pc/$NEW.pc +cvs add $P/patches/$NEW.patch +cvs add $P/txt/$NEW.txt diff -Nru a/patches/bin/rolled-up-patch b/patches/bin/rolled-up-patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/rolled-up-patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,33 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: rolled-up-patch" + exit 1 +} + +if [ $# != 0 ] +then + usage +fi + +RUP=$(mktemp /tmp/rup-XXXXXX) +rm -f $RUP + +for i in $(cat applied-patches) +do + patch_name=$(stripit $i) + cat $P/pc/$patch_name.pc +done | sort | uniq > $RUP + +kdiff $(cat $RUP) +rm -f $RUP diff -Nru a/patches/bin/rpatch b/patches/bin/rpatch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/rpatch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,70 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." 
+ echo "Check your install, or go to the right directory" + exit 1 +} + +do_remove() +{ + if patch -R -p1 -s -i $P/patches/"$1".patch + then + true + else + echo SOMETHING WENT WRONG + exit 1 + fi +} + +kill_old_ones() +{ + FILES=$(cat $P/pc/$1.pc) + for file in $FILES + do + rm -f "$file"~"$1" + done +} + +usage() +{ + echo "Usage: rpatch patchname" + exit 1 +} + +if [ $# == 0 ] +then + usage +fi + +PATCH_NAME=$(stripit $1) + +if is_applied "$PATCH_NAME" +then + if can_remove "$PATCH_NAME" + then + do_remove "$PATCH_NAME" + kill_old_ones "$PATCH_NAME" + remove_from_db "$PATCH_NAME" + else + echo "$PATCH_NAME" does not remove cleanly + exit 1 + fi +else + echo "$PATCH_NAME" is not applied + exit 1 +fi + +top=$(top_patch) +if [ x"$top" == x ] +then + msg="no patches applied" +else + msg="now at $top" +fi + +echo Removed $PATCH_NAME, $msg + diff -Nru a/patches/bin/split-patch b/patches/bin/split-patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/split-patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,29 @@ +#!/usr/bin/perl -w +$out = ""; +while (<>) { + next if (/^Only/); + next if (/^Binary/); + if (/^diff/ || /^Index/) { + if ($out) { + close OUT; + } + (@out) = split(' ', $_); + shift(@out) if (/^diff/); + $out = pop(@out); + $out =~ s:/*usr/:/:; + $out =~ s:/*src/:/:; + $out =~ s:^/*linux[^/]*::; + $out =~ s:\(w\)::; + next if ($out eq ""); + $out = "/var/tmp/patches/$out"; + $dir = $out; + $dir =~ s:/[^/]*$::; + print STDERR "$out\n"; + system("mkdir -p $dir"); + open(OUT, ">$out") || die("cannot open $out"); + } + if ($out) { + print OUT $_; + } +} + diff -Nru a/patches/bin/stripspace b/patches/bin/stripspace --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/stripspace Fri Oct 31 14:10:54 2003 @@ -0,0 +1,24 @@ +#!/usr/bin/perl +# +# Strip whitespace on the end of GNU unified diff insertion lines. +# +# Usage: +# cat foo | stripspace | patch -p1 +# stripspace foo | patch -p1 +# +# Code fragment taken from the ARM Linux patch system, Copyright +# (C) Russell King, All Rights Reserved. +# +# This script: +# +# Copyright (C) 2002 Russell King, All Rights Reserved. +# +# This script is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. +# +while (<>) { + chomp; + s/\s+$// if m/^\+/; + print "$_\n"; +} diff -Nru a/patches/bin/tag-series b/patches/bin/tag-series --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/tag-series Fri Oct 31 14:10:54 2003 @@ -0,0 +1,44 @@ +#!/bin/sh + +# tag-series tagname series-file-name +# +# Does a `cvs tag tagname' of all the .pc, .txt and .patch files mentioned +# in series-file-name. Also tags series-file-name. +# + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." 
+ echo "Check your install, or go to the right directory" + exit 1 +} + +# tag_one tag patchname +# +tag_one() +{ + PN=$(stripit $2) + if [ -r $P/txt/$PN.txt ] + then + cvs tag $1 $P/pc/$PN.pc $P/patches/$PN.patch $P/txt/$PN.txt + else + cvs tag $1 $P/pc/$PN.pc $P/patches/$PN.patch + fi +} + +if [ $# -ne 2 ] +then + echo Usage: tag-series tagname series-file-name + exit 1 +fi + +TAG=$1 +SERIES=$2 + +for p in $(__cat_series $SERIES) +do + tag_one $TAG $p +done +cvs tag $TAG $SERIES diff -Nru a/patches/bin/toppatch b/patches/bin/toppatch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/toppatch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,30 @@ +#!/bin/sh + +. patchfns 2>/dev/null || +. /usr/lib/patch-scripts/patchfns 2>/dev/null || +. $PATCHSCRIPTS_LIBDIR/patchfns 2>/dev/null || +{ + echo "Impossible to find my library 'patchfns'." + echo "Check your install, or go to the right directory" + exit 1 +} + +usage() +{ + echo "Usage: toppatch" + exit 1 +} + +if [ $# != 0 ] +then + usage +fi + +if [ -e $DB ] +then + TOP_PATCH=$(top_patch) + if [ x$TOP_PATCH != x ] + then + echo $TOP_PATCH + fi +fi diff -Nru a/patches/bin/touched-by-patch b/patches/bin/touched-by-patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/touched-by-patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,32 @@ +#!/bin/sh +# Extract names of new files from a patch, print them out + +PATCHFILE=$1 +case "$PATCHFILE" in +*.gz) CMD="gzip -d < $PATCHFILE";; +*) CMD="cat $PATCHFILE";; +esac + +TMP=$(mktemp /tmp/tbp-XXXXXX) || exit 1 +TMP2=$(mktemp /tmp/tbp2-XXXXXX) || exit 1 + +eval $CMD | egrep '^\+\+\+ |^\-\-\- ' > $TMP + +cat $TMP | sed -e 's@[^/]*/\([^ ]*\).*@\1@' \ + | grep -v '^dev\/null$' \ + | sort \ + | uniq \ + > $TMP2 + +rm -f $TMP +grep < $TMP2 '^[+][+][+]' > /dev/null +if [ "$?" = "0" ] +then + echo "WARNING: $PATCHFILE appears to be -p0 form rather than -p1." 1>&2 + echo " Use "\'"p0-2-p1 . . < $PATCHFILE"\'" to fix" 1>&2 + awk '{ print $2 }' < $TMP2 +else + cat $TMP2 +fi | grep -v '~' + +rm -f $TMP2 diff -Nru a/patches/bin/unitdiff.py b/patches/bin/unitdiff.py --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/bin/unitdiff.py Fri Oct 31 14:10:55 2003 @@ -0,0 +1,223 @@ +#!/usr/bin/python + +import sys +import re +import string + +#TODO +# clean up rest/file +# clean up +6 and like (assumptions). should be turned into 'find' +# make regession tests for all cases (Only in, etc) + +try: + filename = sys.argv[1] +except: + print 'requires a file name' + sys.exit(1) + +filefd = open(filename) +file = filefd.read() +filefd.close() + +rest = file +pat = "(^(?:diff .*\n)?--- .*\n\+\+\+ .*)?\n@@ -(\d+),?(\d+)? \+(\d+),?(\d+)? 
@@|^(Only in .*)" +startpat = re.compile(pat, re.M) + +pos = 0 +oldpos = 0 +filelen = len(rest) +oldrest = "" +while(1): + rexp = startpat.search(rest) + if not rexp: + break + + if rexp.group(6): + print rexp.group(6) + rest = rest[rexp.end(6)+1:] + continue + + header = rexp.group(1) + orgfile_start = string.atoi(rexp.group(2)) + if rexp.group(3): + orgfile_len = string.atoi(rexp.group(3)) + else: + orgfile_len = -1 + newfile_start = string.atoi(rexp.group(4)) + if rexp.group(5): + newfile_len = string.atoi(rexp.group(5)) + else: + newfile_len = -1 + rest = rest[rexp.start(2):] + rest = rest[string.find(rest, "\n")+1:] + + rexp2 = startpat.search(rest) + if rexp2: + if rexp2.start(6) != -1: + oldrest = rest[rexp2.start(6)-1:] + rest = rest[:rexp2.start(6)] + elif rexp2.start(1) == -1: + oldrest = rest[rexp2.start(2)-5:] + rest = rest[:rexp2.start(2)-4] + else: + oldrest = rest[rexp2.start(1)-1:] + rest = rest[:rexp2.start(1)] + else: + oldrest = rest + +# pos = filelen - len(oldrest) +# if pos - oldpos > 100: +# sys.stderr.write(`pos`+'/'+`filelen`+'\n') +# oldpos = pos + + first = 1 + oldminuses = 0 + oldplusses = 0 + oldoffset = 0 + while(1): + #erstat early line stuff med lookbehind paa {1,2}-dims + #nedenfor RAA + linepat = "^([^-+\n]*)\n?(((^[-+].*\n)|^(.*\n){1,2}(?=^[-+].*\n))+)(.*)\n?" + compat = re.compile(linepat, re.M) + rexp = compat.search(rest) + if not rexp: + break + + prematch = rexp.group(1) + match = rexp.group(2) + muddle = len(match) + +# print rest +# print 'prematch ', rexp.start(1), rexp.end(1), prematch +# print 'match ---------' +# print match +# print 'match --------' + + # dump unwanted early lines... + if match[0] != "+" and match[0] != "-": + while(1): + next = string.find(match, '\n') + if next == -1: + break + if match[next+1] == "+" or match[next+1] == "-": + prematch = match[:next] + match = match[next+1:] + break + match = match[next+1:] + + +# print 'prematch ', rexp.start(1), rexp.end(1), len(prematch) +# print '('+prematch+')' +# if prematch == ' ': +# print 'space' + muddle = muddle - len(match) + + lines = string.count(match, "\n") + compat = re.compile("^-", re.M) + minuses = len(compat.findall(match)) + compat = re.compile("^\+", re.M) + plusses = len(compat.findall(match)) + orgsize = minuses + 2 + (lines - minuses - plusses) + newsize = plusses + 2 + (lines - minuses - plusses) + + noeol = "^(\\\ No newline at end of file)$" + compnoeol = re.compile(noeol, re.M) + if compnoeol.search(match) or compnoeol.search(rexp.group(6)): + orgsize = orgsize - 1 + newsize = newsize - 1 + + coherent = 0 + if lines - plusses == 0: + coherent = 1 + elif lines - minuses == 0: + coherent = 1 + + # RAA FIXME + if not len(prematch):#or len(prematch) == 1 and prematch == ' ': + orgsize = orgsize -1 + newsize = newsize -1 + if rexp.start(6) == rexp.end(6): + orgsize = orgsize -1 + newsize = newsize -1 + +# print "lines in match: ", lines +# print "number of minuses: ", minuses +# print "number of plusses: ", plusses + + matchpos = rexp.start(2) + muddle + offset = string.count(rest[:matchpos], "\n") + +# print 'offset/oldoffset: ', offset,oldoffset +# print 'oldplusses/oldminuses: ', oldplusses, oldminuses +# print 'orgfile_start/newfile_start: ', orgfile_start, newfile_start + + orgstart = orgfile_start + offset + oldoffset - oldplusses + newstart = newfile_start + offset - oldminuses + oldoffset + + # RAA: Bwadr. 
Fix antagelse om prematch paa en anden + # maade + orgstartmod = 0 + newstartmod = 0 + if orgfile_start == 1 and not len(prematch): + orgstartmod = 1 + if newfile_start == 1 and not len(prematch): + newstartmod = 1 + if orgfile_start == 0 and orgfile_len == 0: + orgstartmod = 1 + # RAA Hack! + plusses = plusses + 1 + minuses = minuses +1 + if newfile_start == 0 and newfile_len == 0: + newstartmod = 1 + # RAA Hack! + plusses = plusses + 1 + minuses = minuses +1 + + if header and first: + print header + first = 0 + + # should the start(1) == 0 be orgstart == 1? RAA + if orgstart == 1 and newstart == 1 and plusses == 0 and coherent: + print "@@ -"+`orgstart`+","+`orgsize`+" +"+`newstart`+" @@" + print match[:string.rfind(match, "\n")] + print rexp.group(6) + elif rexp.start(6) == rexp.end(6) and plusses == 0 and coherent: + if orgstartmod: + orgstart = orgstart + 1 + if newstartmod: + newstart = newstart + 1 + print "@@ -"+`orgstart-1`+","+`orgsize`+" +"+`newstart-1`+" @@" + print prematch + print match[:string.rfind(match, "\n")] + elif orgstart == 1 and orgstart == 1 and minuses == 0 and coherent: + print "@@ -"+`orgstart`+" +"+`newstart`+","+`newsize`+" @@" + print match[:string.rfind(match, "\n")] + print rexp.group(6) + elif rexp.start(6) == rexp.end(6) and minuses == 0 and coherent: + if orgstartmod: + orgstart = orgstart + 1 + if newstartmod: + newstart = newstart + 1 + print "@@ -"+`orgstart-1`+" +"+`newstart-1`+","+`newsize`+" @@" + print prematch + print match[:string.rfind(match, "\n")] + else: + if orgstartmod: + orgstart = orgstart + 1 + if newstartmod: + newstart = newstart + 1 + print "@@ -"+`orgstart-1`+","+`orgsize`+" +"+`newstart-1`+","+`newsize`+" @@" + if len(prematch): + print prematch + print match[:string.rfind(match, "\n")] + if rexp.start(6) != rexp.end(6): + print rexp.group(6) + + rest = rest[rexp.end(6):] + oldminuses = minuses + oldminuses + oldplusses = plusses + oldplusses + oldoffset = oldoffset + offset + lines #include match()-lines + + + rest = oldrest diff -Nru a/patches/patches/all-sources.diff.patch b/patches/patches/all-sources.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/all-sources.diff.patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,15 @@ + +diff -puN Makefile~all-sources.diff Makefile +--- limbo/Makefile~all-sources.diff Tue Oct 21 16:42:36 2003 ++++ limbo-god/Makefile Tue Oct 21 16:42:36 2003 +@@ -805,7 +805,7 @@ mrproper distclean: clean archmrproper + # --------------------------------------------------------------------------- + + define all-sources +- ( find . 
$(RCS_FIND_IGNORE) \ ++ ( find init kernel mm fs ipc lib drivers/block security arch -follow $(RCS_FIND_IGNORE) -name ulevel -prune -o \ + \( -name include -o -name arch \) -prune -o \ + -name '*.[chS]' -print; \ + find arch/$(ARCH) $(RCS_FIND_IGNORE) \ + +_ diff -Nru a/patches/patches/do_mmap2-fix.diff.patch b/patches/patches/do_mmap2-fix.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/do_mmap2-fix.diff.patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,97 @@ + +diff -puN arch/i386/kernel/sys_i386.c~do_mmap2-fix.diff arch/i386/kernel/sys_i386.c +--- limbo/arch/i386/kernel/sys_i386.c~do_mmap2-fix.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/i386/kernel/sys_i386.c Tue Oct 21 16:42:37 2003 +@@ -56,7 +56,7 @@ static inline long do_mmap2( + } + + down_write(¤t->mm->mmap_sem); +- error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); ++ error = do_mmap_pgoff(current->mm, file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) +diff -puN include/linux/mm.h~do_mmap2-fix.diff include/linux/mm.h +--- limbo/include/linux/mm.h~do_mmap2-fix.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/include/linux/mm.h Tue Oct 21 16:42:37 2003 +@@ -487,6 +487,9 @@ static inline int set_page_dirty(struct + return __set_page_dirty_buffers(page); + } + ++extern long do_mprotect(struct mm_struct *mm, unsigned long start, ++ size_t len, unsigned long prot); ++ + /* + * On a two-level page table, this ends up being trivial. Thus the + * inlining and the symmetry break with pte_alloc_map() that does all +@@ -517,9 +520,10 @@ extern void exit_mmap(struct mm_struct * + + extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + +-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, +- unsigned long len, unsigned long prot, +- unsigned long flag, unsigned long pgoff); ++extern unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file *file, ++ unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flag, ++ unsigned long pgoff); + + static inline unsigned long do_mmap(struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, +@@ -529,7 +533,8 @@ static inline unsigned long do_mmap(stru + if ((offset + PAGE_ALIGN(len)) < offset) + goto out; + if (!(offset & ~PAGE_MASK)) +- ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); ++ ret = do_mmap_pgoff(current->mm, file, addr, len, prot, flag, ++ offset >> PAGE_SHIFT); + out: + return ret; + } +diff -puN mm/mmap.c~do_mmap2-fix.diff mm/mmap.c +--- limbo/mm/mmap.c~do_mmap2-fix.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/mm/mmap.c Tue Oct 21 16:42:37 2003 +@@ -434,11 +434,11 @@ static int vma_merge(struct mm_struct *m + * The caller must hold down_write(current->mm->mmap_sem). 
+ */ + +-unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, +- unsigned long len, unsigned long prot, +- unsigned long flags, unsigned long pgoff) ++unsigned long do_mmap_pgoff(struct mm_struct *mm, struct file * file, ++ unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, ++ unsigned long pgoff) + { +- struct mm_struct * mm = current->mm; + struct vm_area_struct * vma, * prev; + struct inode *inode; + unsigned int vm_flags; +diff -puN mm/mprotect.c~do_mmap2-fix.diff mm/mprotect.c +--- limbo/mm/mprotect.c~do_mmap2-fix.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/mm/mprotect.c Tue Oct 21 16:42:37 2003 +@@ -222,7 +222,8 @@ fail: + } + + asmlinkage long +-sys_mprotect(unsigned long start, size_t len, unsigned long prot) ++do_mprotect(struct mm_struct *mm, unsigned long start, size_t len, ++ unsigned long prot) + { + unsigned long vm_flags, nstart, end, tmp; + struct vm_area_struct * vma, * next, * prev; +@@ -326,6 +327,11 @@ sys_mprotect(unsigned long start, size_t + prev->vm_mm->map_count--; + } + out: +- up_write(¤t->mm->mmap_sem); ++ up_write(&mm->mmap_sem); + return error; + } ++ ++asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot) ++{ ++ return(do_mprotect(current->mm, start, len, prot)); ++} + +_ diff -Nru a/patches/patches/export-generic_forget_inode.diff.patch b/patches/patches/export-generic_forget_inode.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/export-generic_forget_inode.diff.patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,34 @@ + +diff -puN fs/inode.c~export-generic_forget_inode.diff fs/inode.c +--- limbo/fs/inode.c~export-generic_forget_inode.diff Tue Oct 21 16:42:42 2003 ++++ limbo-god/fs/inode.c Tue Oct 21 16:42:42 2003 +@@ -1012,7 +1012,7 @@ void generic_delete_inode(struct inode * + + EXPORT_SYMBOL(generic_delete_inode); + +-static void generic_forget_inode(struct inode *inode) ++void generic_forget_inode(struct inode *inode) + { + struct super_block *sb = inode->i_sb; + +@@ -1039,6 +1039,7 @@ static void generic_forget_inode(struct + clear_inode(inode); + destroy_inode(inode); + } ++EXPORT_SYMBOL(generic_forget_inode); + + /* + * Normal UNIX filesystem behaviour: delete the +diff -puN include/linux/fs.h~export-generic_forget_inode.diff include/linux/fs.h +--- limbo/include/linux/fs.h~export-generic_forget_inode.diff Tue Oct 21 16:42:42 2003 ++++ limbo-god/include/linux/fs.h Tue Oct 21 16:42:42 2003 +@@ -1248,6 +1248,7 @@ extern struct inode * igrab(struct inode + extern ino_t iunique(struct super_block *, ino_t); + extern int inode_needs_sync(struct inode *inode); + extern void generic_delete_inode(struct inode *inode); ++extern void generic_forget_inode(struct inode *inode); + + extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, + int (*test)(struct inode *, void *), void *data); + +_ diff -Nru a/patches/patches/export-page_cache_readahead.diff.patch b/patches/patches/export-page_cache_readahead.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/export-page_cache_readahead.diff.patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,14 @@ + +diff -puN mm/readahead.c~export-page_cache_readahead.diff mm/readahead.c +--- limbo/mm/readahead.c~export-page_cache_readahead.diff Tue Oct 21 16:42:46 2003 ++++ limbo-god/mm/readahead.c Tue Oct 21 16:42:46 2003 +@@ -464,6 +464,7 @@ do_io: + out: + return; + } ++EXPORT_SYMBOL(page_cache_readahead); + + + /* + +_ diff -Nru a/patches/patches/export-remove_from_page_cache.diff.patch 
b/patches/patches/export-remove_from_page_cache.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/export-remove_from_page_cache.diff.patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,14 @@ + +diff -puN mm/filemap.c~export-remove_from_page_cache.diff mm/filemap.c +--- limbo/mm/filemap.c~export-remove_from_page_cache.diff Tue Oct 21 16:42:45 2003 ++++ limbo-god/mm/filemap.c Tue Oct 21 16:42:45 2003 +@@ -109,6 +109,7 @@ void remove_from_page_cache(struct page + __remove_from_page_cache(page); + spin_unlock(&mapping->page_lock); + } ++EXPORT_SYMBOL(remove_from_page_cache); + + static inline int sync_page(struct page *page) + { + +_ diff -Nru a/patches/patches/fs_activation.diff.patch b/patches/patches/fs_activation.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/fs_activation.diff.patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,150 @@ + +diff -puN fs/jbd/transaction.c~fs_activation.diff fs/jbd/transaction.c +--- limbo/fs/jbd/transaction.c~fs_activation.diff Tue Oct 21 16:42:47 2003 ++++ limbo-god/fs/jbd/transaction.c Tue Oct 21 16:42:47 2003 +@@ -108,6 +108,7 @@ alloc_transaction: + + jbd_debug(3, "New handle %p going live.\n", handle); + ++ handle->h_journal = journal; + repeat: + + /* +@@ -245,6 +246,23 @@ static handle_t *new_handle(int nblocks) + return handle; + } + ++/* ++ * push @handle into ->fs_context stack ++ */ ++static void push_handle(handle_t *handle) ++{ ++ handle->h_parent = current->fs_context; ++ current->fs_context = (struct fs_activation *) handle; ++} ++ ++/* ++ * pop top of ->fs_context stack ++ */ ++static void pop_handle(handle_t *handle) ++{ ++ current->fs_context = (struct fs_activation *) handle->h_parent; ++} ++ + /** + * handle_t *journal_start() - Obtain a new handle. + * @journal: Journal to start transaction on. +@@ -267,7 +285,7 @@ handle_t *journal_start(journal_t *journ + if (!journal) + return ERR_PTR(-EROFS); + +- if (handle) { ++ if (handle && handle->h_journal == journal) { + J_ASSERT(handle->h_transaction->t_journal == journal); + handle->h_ref++; + return handle; +@@ -277,12 +295,13 @@ handle_t *journal_start(journal_t *journ + if (!handle) + return ERR_PTR(-ENOMEM); + +- current->journal_info = handle; ++ push_handle(handle); + + err = start_this_handle(journal, handle); + if (err < 0) { ++ kfree(handle); ++ pop_handle(handle); + jbd_free_handle(handle); +- current->journal_info = NULL; + handle = ERR_PTR(err); + } + return handle; +@@ -1357,7 +1376,7 @@ int journal_stop(handle_t *handle) + } while (old_handle_count != transaction->t_handle_count); + } + +- current->journal_info = NULL; ++ pop_handle(handle); + spin_lock(&journal->j_state_lock); + spin_lock(&transaction->t_handle_lock); + transaction->t_outstanding_credits -= handle->h_buffer_credits; +diff -puN include/linux/init_task.h~fs_activation.diff include/linux/init_task.h +--- limbo/include/linux/init_task.h~fs_activation.diff Tue Oct 21 16:42:47 2003 ++++ limbo-god/include/linux/init_task.h Tue Oct 21 16:42:47 2003 +@@ -107,7 +107,7 @@ + .alloc_lock = SPIN_LOCK_UNLOCKED, \ + .proc_lock = SPIN_LOCK_UNLOCKED, \ + .switch_lock = SPIN_LOCK_UNLOCKED, \ +- .journal_info = NULL, \ ++ .fs_context = NULL, \ + } + + +diff -puN include/linux/jbd.h~fs_activation.diff include/linux/jbd.h +--- limbo/include/linux/jbd.h~fs_activation.diff Tue Oct 21 16:42:47 2003 ++++ limbo-god/include/linux/jbd.h Tue Oct 21 16:42:47 2003 +@@ -384,6 +384,14 @@ struct jbd_revoke_table_s; + + struct handle_s + { ++ /* Which journal this handle belongs to. 
diff -Nru a/patches/patches/fsync_super.diff.patch b/patches/patches/fsync_super.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/fsync_super.diff.patch	Fri Oct 31 14:10:54 2003
@@ -0,0 +1,14 @@
+
+diff -puN fs/buffer.c~fsync_super.diff fs/buffer.c
+--- limbo/fs/buffer.c~fsync_super.diff	Tue Oct 21 16:42:40 2003
++++ limbo-god/fs/buffer.c	Tue Oct 21 16:42:40 2003
+@@ -239,6 +239,7 @@ int fsync_super(struct super_block *sb)
+ 
+ 	return sync_blockdev(sb->s_bdev);
+ }
++EXPORT_SYMBOL(fsync_super);
+ 
+ /*
+  * Write out and wait upon all dirty data associated with this
+
+_
diff -Nru a/patches/patches/i386-sys_reiser4.diff.patch b/patches/patches/i386-sys_reiser4.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/i386-sys_reiser4.diff.patch	Fri Oct 31 14:10:55 2003
@@ -0,0 +1,39 @@
+
+diff -puN arch/i386/kernel/entry.S~i386-sys_reiser4.diff arch/i386/kernel/entry.S
+--- limbo/arch/i386/kernel/entry.S~i386-sys_reiser4.diff	Tue Oct 21 16:42:36 2003
++++ limbo-god/arch/i386/kernel/entry.S	Tue Oct 21 16:42:36 2003
+@@ -880,5 +880,10 @@ ENTRY(sys_call_table)
+ 	.long sys_utimes
+ 	.long sys_fadvise64_64
+ 	.long sys_ni_syscall	/* sys_vserver */
++#ifdef CONFIG_REISER4_FS
++	.long sys_reiser4
++#else
++	.long sys_ni_syscall
++#endif
+ 
+ nr_syscalls=(.-sys_call_table)/4
+diff -puN include/asm-i386/unistd.h~i386-sys_reiser4.diff include/asm-i386/unistd.h
+--- limbo/include/asm-i386/unistd.h~i386-sys_reiser4.diff	Tue Oct 21 16:42:36 2003
++++ limbo-god/include/asm-i386/unistd.h	Tue Oct 21 16:42:36 2003
+@@ -279,8 +279,9 @@
+ #define __NR_utimes		271
+ #define __NR_fadvise64_64	272
+ #define __NR_vserver		273
++#define __NR_reiser4		274
+ 
+-#define NR_syscalls 274
++#define NR_syscalls 275
+ 
+ /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
+ 
+@@ -396,6 +397,7 @@ static inline _syscall3(int,open,const c
+ static inline _syscall1(int,close,int,fd)
+ static inline _syscall1(int,_exit,int,exitcode)
+ static inline _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options)
++static inline _syscall1(long,reiser4,char*,p_string)
+ 
+ #endif
+
+_
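Nothing here teaches libc about the new entry point, so early users would
have to invoke it by number. A hedged userspace sketch (the command string
and its interpretation are reiser4-specific; 274 matches the i386 #define
above):

	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef __NR_reiser4
	#define __NR_reiser4 274	/* i386 only, from the patch above */
	#endif

	int main(void)
	{
		long err = syscall(__NR_reiser4, "reiser4 command string");
		return err < 0 ? 1 : 0;
	}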
diff -Nru a/patches/patches/init_fixmap_vma.diff.patch b/patches/patches/init_fixmap_vma.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/init_fixmap_vma.diff.patch	Fri Oct 31 14:10:55 2003
@@ -0,0 +1,61 @@
+
+diff -puN mm/memory.c~init_fixmap_vma.diff mm/memory.c
+--- limbo/mm/memory.c~init_fixmap_vma.diff	Tue Oct 21 16:42:45 2003
++++ limbo-god/mm/memory.c	Tue Oct 21 16:42:45 2003
+@@ -44,6 +44,7 @@
+ #include <linux/highmem.h>
+ #include <linux/pagemap.h>
+ #include <linux/rmap-locking.h>
++#include <linux/init.h>
+ #include <linux/module.h>
+ 
+ #include <asm/pgalloc.h>
+@@ -680,6 +681,24 @@ static inline struct page *get_page_map(
+ }
+ 
+ 
++static struct vm_area_struct fixmap_vma = {
++	/* Catch users - if there are any valid
++	   ones, we can make this be "&init_mm" or
++	   something. */
++	.vm_mm = NULL,
++	.vm_page_prot = PAGE_READONLY,
++	.vm_flags = VM_READ | VM_EXEC,
++};
++
++static int init_fixmap_vma(void)
++{
++	fixmap_vma.vm_start = FIXADDR_START;
++	fixmap_vma.vm_end = FIXADDR_TOP;
++	return(0);
++}
++
++__initcall(init_fixmap_vma);
++
+ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+ 		unsigned long start, int len, int write, int force,
+ 		struct page **pages, struct vm_area_struct **vmas)
+@@ -700,19 +719,8 @@ int get_user_pages(struct task_struct *t
+ 
+ 		vma = find_extend_vma(mm, start);
+ 
+-#ifdef FIXADDR_USER_START
+-		if (!vma &&
+-		    start >= FIXADDR_USER_START && start < FIXADDR_USER_END) {
+-			static struct vm_area_struct fixmap_vma = {
+-				/* Catch users - if there are any valid
+-				   ones, we can make this be "&init_mm" or
+-				   something. */
+-				.vm_mm = NULL,
+-				.vm_start = FIXADDR_USER_START,
+-				.vm_end = FIXADDR_USER_END,
+-				.vm_page_prot = PAGE_READONLY,
+-				.vm_flags = VM_READ | VM_EXEC,
+-			};
++#ifdef FIXADDR_START
++		if (!vma && start >= FIXADDR_START && start < FIXADDR_TOP) {
+ 			unsigned long pg = start & PAGE_MASK;
+ 			pgd_t *pgd;
+ 			pmd_t *pmd;
+
+_
diff -Nru a/patches/patches/page-owner.diff.patch b/patches/patches/page-owner.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/page-owner.diff.patch	Fri Oct 31 14:10:55 2003
@@ -0,0 +1,70 @@
+
+diff -puN include/linux/mm.h~page-owner.diff include/linux/mm.h
+--- limbo/include/linux/mm.h~page-owner.diff	Tue Oct 21 16:42:44 2003
++++ limbo-god/include/linux/mm.h	Tue Oct 21 16:42:44 2003
+@@ -197,6 +197,9 @@ struct page {
+ 	void *virtual;			/* Kernel virtual address (NULL if
+ 					   not kmapped, ie. highmem) */
+ #endif /* WANT_PAGE_VIRTUAL */
++#ifdef CONFIG_REISER4_DEBUG
++	void *owner;
++#endif
+ };
+ 
+ /*
+diff -puN include/linux/page-flags.h~page-owner.diff include/linux/page-flags.h
+--- limbo/include/linux/page-flags.h~page-owner.diff	Tue Oct 21 16:42:44 2003
++++ limbo-god/include/linux/page-flags.h	Tue Oct 21 16:42:44 2003
+@@ -141,14 +141,48 @@ extern void get_full_page_state(struct p
+  */
+ #define PageLocked(page)		\
+ 		test_bit(PG_locked, &(page)->flags)
+-#define SetPageLocked(page)		\
+-		set_bit(PG_locked, &(page)->flags)
++
++#ifdef CONFIG_REISER4_DEBUG
++#define TestSetPageLocked(page)				\
++({							\
++	int ret;					\
++							\
++	ret = test_and_set_bit(PG_locked, &(page)->flags);	\
++	if (!ret)					\
++		page->owner = current;			\
++	ret;						\
++})
++
++#define ClearPageLocked(page)				\
++({							\
++	page->owner = NULL;				\
++	clear_bit(PG_locked, &(page)->flags);		\
++})
++
++#define TestClearPageLocked(page)			\
++({							\
++	page->owner = NULL;				\
++	test_and_clear_bit(PG_locked, &(page)->flags);	\
++})
++
++#define SetPageLocked(page)				\
++({							\
++							\
++	set_bit(PG_locked, &(page)->flags);		\
++	page->owner = current;				\
++})
++
++#else
++
+ #define TestSetPageLocked(page)		\
+ 		test_and_set_bit(PG_locked, &(page)->flags)
+ #define ClearPageLocked(page)		\
+ 		clear_bit(PG_locked, &(page)->flags)
+ #define TestClearPageLocked(page)	\
+ 		test_and_clear_bit(PG_locked, &(page)->flags)
++#define SetPageLocked(page)		\
++		set_bit(PG_locked, &(page)->flags)
++#endif
+ 
+ #define PageError(page)		test_bit(PG_error, &(page)->flags)
+ #define SetPageError(page)	set_bit(PG_error, &(page)->flags)
+
+_
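The owner field added by page-owner.diff is pure instrumentation; nothing in
the VM consumes it. What it buys is the ability to name the task holding
PG_locked when a page gets stuck. A sketch of the kind of check it enables
(report_stuck_page is illustrative, not part of the patch):

	/* CONFIG_REISER4_DEBUG only: page->owner is maintained by the
	 * *PageLocked() wrappers above */
	static void report_stuck_page(struct page *page)
	{
		if (PageLocked(page))
			printk("page %p is locked by task %p\n",
			       page, page->owner);
	}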
diff -Nru a/patches/patches/reget-page-mapping.diff.patch b/patches/patches/reget-page-mapping.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/reget-page-mapping.diff.patch	Fri Oct 31 14:10:54 2003
@@ -0,0 +1,51 @@
+
+reiser4 keeps its meta-data pages in the page cache, attached to a special
+"fake" inode.  Meta-data blocks have "znodes" attached to them (the reiser4
+analog of buffer_head) and initially don't have real disk block numbers
+assigned.  Later, meta-data blocks can be "relocated" to decrease
+fragmentation.  As a result, their pages cannot easily be indexed by block
+number.  Instead, reiser4 indexes the pages of the fake inode by some
+function of the znode address.  This looks weird, but it works.  The only
+problem is a race involving ->releasepage(): there is a window when the
+znode has already been freed by reiser4_releasepage(), but its page still
+exists (albeit locked).  If at this moment another znode is allocated at
+the same address as the one just destroyed, some other thread can acquire
+a reference to the lingering page (because it is indexed by the address of
+the znode), and prevent shrink_list() from freeing it.
+
+To avoid this, reiser4_releasepage() removes the page from the radix tree
+manually.  This requires re-checking page->mapping after calling
+try_to_release_page().
+
+
+
+diff -puN mm/vmscan.c~reget-page-mapping.diff mm/vmscan.c
+--- limbo/mm/vmscan.c~reget-page-mapping.diff	Tue Oct 21 16:43:13 2003
++++ limbo-god/mm/vmscan.c	Tue Oct 21 16:43:28 2003
+@@ -412,6 +412,11 @@ shrink_list(struct list_head *page_list,
+ 		if (PagePrivate(page)) {
+ 			if (!try_to_release_page(page, gfp_mask))
+ 				goto activate_locked;
++			/*
++			 * file system may manually remove page from the page
++			 * cache in ->releasepage().  Check for this.
++			 */
++			mapping = page->mapping;
+ 			if (!mapping && page_count(page) == 1)
+ 				goto free_it;
+ 		}
+diff -puN mm/truncate.c~reget-page-mapping.diff mm/truncate.c
+--- limbo/mm/truncate.c~reget-page-mapping.diff	Tue Oct 21 16:43:13 2003
++++ limbo-god/mm/truncate.c	Tue Oct 21 16:43:28 2003
+@@ -74,6 +74,9 @@ invalidate_complete_page(struct address_
+ 	if (PagePrivate(page) && !try_to_release_page(page, 0))
+ 		return 0;
+ 
++	if (page->mapping != mapping)
++		return 0;
++
+ 	spin_lock(&mapping->page_lock);
+ 	if (PageDirty(page)) {
+ 		spin_unlock(&mapping->page_lock);
+
+_
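The rule both hunks above implement is the same: after try_to_release_page()
returns, page->mapping may have been cleared by the filesystem, so it must be
reloaded before use. As a caller-side pattern (a sketch, not a real kernel
path):

	if (PagePrivate(page) && try_to_release_page(page, gfp_mask)) {
		/* ->releasepage() may have removed the page itself */
		mapping = page->mapping;
		if (mapping == NULL) {
			/* no longer in the page cache; treat as gone */
		}
	}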
diff -Nru a/patches/patches/reiser4-fs-Kconfig.diff.patch b/patches/patches/reiser4-fs-Kconfig.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/reiser4-fs-Kconfig.diff.patch	Fri Oct 31 14:10:55 2003
@@ -0,0 +1,219 @@
+
+diff -puN fs/Kconfig~reiser4-fs-Kconfig.diff fs/Kconfig
+--- limbo/fs/Kconfig~reiser4-fs-Kconfig.diff	Tue Oct 21 18:29:01 2003
++++ limbo-god/fs/Kconfig	Tue Oct 21 18:31:05 2003
+@@ -193,6 +193,212 @@ config FS_MBCACHE
+ 	default y if EXT2_FS=y || EXT3_FS=y
+ 	default m if EXT2_FS=m || EXT3_FS=m
+ 
++config REISER4_FS
++	bool "Reiser4 (EXPERIMENTAL very fast general purpose filesystem)"
++	depends on EXPERIMENTAL
++	---help---
++	  Reiser4 is more than twice as fast for both reads and writes as
++	  ReiserFS.  That means it is four times as fast as NTFS by Microsoft.
++	  (Proper benchmarks will appear in a few months at
++	  www.namesys.com/benchmarks.html, please be patient for now).
++
++	  It is the storage layer of what will become a general purpose naming
++	  system --- like what Microsoft wants OFS to be, except designed with a
++	  clean new semantic layer rather than being SQL based like OFS.
++
++	  It performs all filesystem operations as atomic transactions, which
++	  means that it either performs a write, or it does not, and in the
++	  event of a crash it does not partially perform it or corrupt it.
++
++	  It stores files in dancing trees, which are like balanced trees but
++	  faster.  It packs small files together so that they share blocks
++	  without wasting space.  This means you can use it to store really
++	  small files.  It also means that it saves you disk space.  It avoids
++	  hassling you with anachronisms like having a maximum number of
++	  inodes, and wasting space if you use less than that number.
++
++	  It can handle really large directories, because its search
++	  algorithms are logarithmic with size, not linear.  With Reiser4 you
++	  should use subdirectories because they help YOU, not because they
++	  help your filesystem's performance, or because your filesystem won't
++	  be able to shrink a directory once you have let it grow.  For squid
++	  and similar applications, everything in one directory should perform
++	  better.
++
++	  It has a plugin-based infrastructure, which means that you can easily
++	  invent new kinds of files, and so can other people, so it will evolve
++	  rapidly.
++
++	  We will be adding a variety of security features to it that DARPA has
++	  funded us to write.
++
++	  "reiser4" is a distinct filesystem mount type from "reiserfs" (V3),
++	  which means that "reiserfs" filesystems will be unaffected by any
++	  reiser4 bugs.  They have no code in common.  Reiser4 is a complete
++	  rewrite from scratch, fully incorporating what we learned by experience
++	  while doing "reiserfs" the first time.  That was a lot. ;-)
++
++	  Reiser4 is about as stable as the usual tornado for now --- it is
++	  for use by developers and testers only.  We don't use it for our web
++	  server --- you should not either.  This will change before 2.6.0.
++	  ReiserFS V3 is the right choice for those who want a filesystem so
++	  stable that we can go for months now without any bug reports while we
++	  have millions of users.
++
++	  If you'd like to upgrade from reiserfs to reiser4, use tar to a
++	  temporary disk, maybe using NFS/ssh/SFS to get to that disk, or ask
++	  your favorite distro to sponsor writing a conversion program.
++
++	  Sponsored by the Defense Advanced Research Projects Agency (DARPA)
++	  of the United States Government.  DARPA does not endorse this
++	  project; it merely sponsors it.
++	  See http://www.darpa.mil/ato/programs/chats.htm
++
++	  To learn more about reiser4, go to http://www.namesys.com
++
++config REISER4_FS_SYSCALL
++	bool
++#	bool "Enable reiser4 system call"
++	default n
++	depends on REISER4_FS
++	---help---
++	  Adds the sys_reiser4() syscall.
++	  This code is not in good shape yet and may not even compile.
++
++config REISER4_LARGE_KEY
++	bool "Use larger keys on reiser4 tree"
++	depends on REISER4_FS
++	default y
++	---help---
++	  Make keys larger and use the additional bits to order bodies of files
++	  within a directory in the order of their names, which is what you want
++	  normally.  If you turn this off, file bodies will be ordered by creation
++	  time, which is not optimal for most users.
++
++	  Warning: flipping this option makes your file system binary
++	  incompatible.
++
++config REISER4_CHECK
++	bool "Enable reiser4 debug options"
++	depends on REISER4_FS
++	---help---
++	  Don't use this unless you are a developer debugging reiser4.  If
++	  using a kernel made by a distro that thinks they are our competitor
++	  (sigh) rather than made by Linus, always check each release to make
++	  sure they have not turned this on to make us look slow as was done
++	  once in the past.  This checks everything imaginable while reiser4
++	  runs.
++
++	  When adding features to reiser4 you should set this, and then
++	  extensively test the code, and then send to us and we will test it
++	  again.  Include a description of what you did to test it.  All
++	  reiser4 code must be tested, reviewed, and signed off on by two
++	  persons before it will be accepted into a stable kernel by Hans.
++
++config REISER4_DEBUG
++	bool "Assertions"
++	depends on REISER4_CHECK
++	help
++	  Turns on assertion checks.  Eats a lot of CPU.
++
++config REISER4_FS_SYSCALL_DEBUG
++	bool "Enable reiser4 system call debug"
++	depends on REISER4_CHECK
++	help
++	  Turns on debugging of the reiser4 system call.
++
++config REISER4_DEBUG_MODIFY
++	bool "Dirtying"
++	depends on REISER4_CHECK
++	help
++	  Check that a node is marked dirty each time it is modified.  This is
++	  done by maintaining a checksum of node content.  CPU hog.
++
++config REISER4_DEBUG_MEMCPY
++	bool "Memory copying"
++	depends on REISER4_CHECK
++	help
++	  Use special non-inlined versions of memcpy, memset, and memmove in
++	  reiser4 to estimate the amount of CPU time spent in data copying.
++
++config REISER4_DEBUG_NODE
++	bool "Node consistency"
++	depends on REISER4_CHECK
++	help
++	  Run consistency checks on nodes in the balanced tree.  CPU hog.
++
++config REISER4_ZERO_NEW_NODE
++	bool "Node zeroing"
++	depends on REISER4_CHECK
++	help
++	  Zero new nodes before use.
++
++config REISER4_TRACE
++	bool "Tracing"
++	depends on REISER4_CHECK
++	help
++	  Turn on the tracing facility.  This enables the trace_flags mount option.
++
++config REISER4_EVENT_LOG
++	bool "Log events"
++	depends on REISER4_CHECK
++	help
++	  Log events into a user-supplied file.  This enables the trace_file mount option.
++
++config REISER4_STATS
++	bool "Statistics"
++	depends on REISER4_CHECK
++	help
++	  Turn on statistics collection.  This increases the size of the in-memory super
++	  block considerably.
++
++config REISER4_PROF
++	bool "Profiling"
++	depends on REISER4_CHECK
++	help
++	  Turn on collection of profiling information available through sysfs.
++
++config REISER4_LOCKPROF
++	bool "Lock Profiling"
++	depends on REISER4_CHECK && PROFILING
++	help
++	  Turn on collection of spin lock contention information.
++
++config REISER4_DEBUG_OUTPUT
++	bool "Printing"
++	depends on REISER4_CHECK
++	help
++	  Enable compilation of functions that print internal kernel data
++	  structures in human-readable form.  Useful for debugging.
++
++config REISER4_NOOPT
++	bool "Disable optimization"
++	depends on REISER4_CHECK
++	help
++	  Disable compiler optimizations for reiser4 code.
++
++config REISER4_USE_EFLUSH
++#	bool "Enable emergency flush"
++	bool
++	default y
++	depends on REISER4_FS
++	help
++	  Say Y unless you know what you are doing.  Details are in reiser4/emergency_flush.c.
++
++config REISER4_COPY_ON_CAPTURE
++	bool "Enable copy on capture"
++	depends on REISER4_FS
++	help
++	  Say N unless you know what you are doing.  This is under development.
++
++config REISER4_BADBLOCKS
++	bool "Enable handling of badblocks in system areas"
++	depends on REISER4_FS
++	help
++	  This allows you to use filesystems with badblocks in static reiser4 system areas
++	  (such as superblock, bitmaps, journal header/footer).  This imposes some performance
++	  penalty, so say N unless you have such a filesystem.
++
+ config REISERFS_FS
+ 	tristate "Reiserfs support"
+ 	help
+
+_
diff -Nru a/patches/patches/reiser4-fs-Makefile.diff.patch b/patches/patches/reiser4-fs-Makefile.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/reiser4-fs-Makefile.diff.patch	Fri Oct 31 14:10:55 2003
@@ -0,0 +1,14 @@
+
+diff -puN fs/Makefile~reiser4-fs-Makefile.diff fs/Makefile
+--- limbo/fs/Makefile~reiser4-fs-Makefile.diff	Tue Oct 21 16:42:40 2003
++++ limbo-god/fs/Makefile	Tue Oct 21 16:42:40 2003
+@@ -85,6 +85,7 @@ obj-$(CONFIG_AUTOFS_FS)		+= autofs/
+ obj-$(CONFIG_AUTOFS4_FS)	+= autofs4/
+ obj-$(CONFIG_ADFS_FS)		+= adfs/
+ obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
++obj-$(CONFIG_REISER4_FS)	+= reiser4/
+ obj-$(CONFIG_UDF_FS)		+= udf/
+ obj-$(CONFIG_SUN_OPENPROMFS)	+= openpromfs/
+ obj-$(CONFIG_JFS_FS)		+= jfs/
+
+_
diff -Nru a/patches/patches/sb_sync_inodes.diff.patch b/patches/patches/sb_sync_inodes.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/sb_sync_inodes.diff.patch	Fri Oct 31 14:10:55 2003
@@ -0,0 +1,56 @@
+
+diff -puN fs/fs-writeback.c~sb_sync_inodes.diff fs/fs-writeback.c
+--- limbo/fs/fs-writeback.c~sb_sync_inodes.diff	Tue Oct 21 16:42:41 2003
++++ limbo-god/fs/fs-writeback.c	Tue Oct 21 16:42:41 2003
+@@ -248,8 +248,8 @@ __writeback_single_inode(struct inode *i
+  * on the writer throttling path, and we get decent balancing between many
+  * throttled threads: we don't want them all piling up on __wait_on_inode.
+ */ +-static void +-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) ++void ++generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) + { + const unsigned long start = jiffies; /* livelock avoidance */ + +@@ -324,6 +324,16 @@ sync_sb_inodes(struct super_block *sb, s + return; /* Leave any unwritten inodes on s_io */ + } + ++static void ++sync_sb_inodes (struct super_block *sb, struct writeback_control *wbc) ++{ ++ if (sb->s_op->sync_inodes) ++ sb->s_op->sync_inodes(sb, wbc); ++ else ++ generic_sync_sb_inodes(sb, wbc); ++} ++ ++ + /* + * Start writeback of dirty pagecache data against all unlocked inodes. + * +diff -puN include/linux/writeback.h~sb_sync_inodes.diff include/linux/writeback.h +--- limbo/include/linux/writeback.h~sb_sync_inodes.diff Tue Oct 21 16:42:41 2003 ++++ limbo-god/include/linux/writeback.h Tue Oct 21 16:42:41 2003 +@@ -55,6 +55,7 @@ struct writeback_control { + * fs/fs-writeback.c + */ + void writeback_inodes(struct writeback_control *wbc); ++void generic_sync_sb_inodes(struct super_block * sb, struct writeback_control * wbc); + void wake_up_inode(struct inode *inode); + void __wait_on_inode(struct inode * inode); + void sync_inodes_sb(struct super_block *, int wait); +diff -puN include/linux/fs.h~sb_sync_inodes.diff include/linux/fs.h +--- limbo/include/linux/fs.h~sb_sync_inodes.diff Tue Oct 21 16:42:41 2003 ++++ limbo-god/include/linux/fs.h Tue Oct 21 16:42:41 2003 +@@ -871,6 +871,7 @@ struct super_operations { + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); + ++ void (*sync_inodes) (struct super_block *, struct writeback_control * wbc); + int (*show_options)(struct seq_file *, struct vfsmount *); + }; + + +_ diff -Nru a/patches/patches/spinlock-owner.diff.patch b/patches/patches/spinlock-owner.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/spinlock-owner.diff.patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,143 @@ + +diff -puN include/asm-i386/spinlock.h~spinlock-owner.diff include/asm-i386/spinlock.h +--- limbo/include/asm-i386/spinlock.h~spinlock-owner.diff Tue Oct 21 16:42:43 2003 ++++ limbo-god/include/asm-i386/spinlock.h Tue Oct 21 16:42:43 2003 +@@ -4,6 +4,7 @@ + #include <asm/atomic.h> + #include <asm/rwlock.h> + #include <asm/page.h> ++#include <asm/current.h> + #include <linux/config.h> + #include <linux/compiler.h> + +@@ -18,15 +19,31 @@ typedef struct { + volatile unsigned int lock; + #ifdef CONFIG_DEBUG_SPINLOCK + unsigned magic; ++ void *owner; + #endif + } spinlock_t; + + #define SPINLOCK_MAGIC 0xdead4ead + + #ifdef CONFIG_DEBUG_SPINLOCK +-#define SPINLOCK_MAGIC_INIT , SPINLOCK_MAGIC ++#define SPINLOCK_MAGIC_INIT , SPINLOCK_MAGIC, NULL ++#include <asm/current.h> ++#define SPIN_DONT_CHECK ((void *)1) ++static inline void spin_lock_dont_check(spinlock_t *lock) ++{ ++ lock->owner = SPIN_DONT_CHECK; ++} ++ ++#define spin_bug(lock) \ ++({ \ ++ printk("spinlock bug: %p: %x, %x, %p, %p\n", \ ++ lock, lock->lock, lock->magic, lock->owner, get_current()); \ ++ BUG(); \ ++}) ++ + #else + #define SPINLOCK_MAGIC_INIT /* */ ++#define spin_lock_dont_check(lock) + #endif + + #define SPIN_LOCK_UNLOCKED (spinlock_t) { 1 SPINLOCK_MAGIC_INIT } +@@ -71,8 +88,15 @@ typedef struct { + static inline void _raw_spin_unlock(spinlock_t *lock) + { + #ifdef CONFIG_DEBUG_SPINLOCK +- BUG_ON(lock->magic != SPINLOCK_MAGIC); +- BUG_ON(!spin_is_locked(lock)); ++ if (lock->magic != SPINLOCK_MAGIC) ++ spin_bug(lock); ++ if (lock->owner != SPIN_DONT_CHECK) { ++ if (!spin_is_locked(lock)) ++ 
spin_bug(lock); ++ if (lock->owner != get_current()) ++ spin_bug(lock); ++ lock->owner = NULL; ++ } + #endif + __asm__ __volatile__( + spin_unlock_string +@@ -90,8 +114,15 @@ static inline void _raw_spin_unlock(spin + { + char oldval = 1; + #ifdef CONFIG_DEBUG_SPINLOCK +- BUG_ON(lock->magic != SPINLOCK_MAGIC); +- BUG_ON(!spin_is_locked(lock)); ++ if (lock->magic != SPINLOCK_MAGIC) ++ spin_bug(lock); ++ if (lock->owner != SPIN_DONT_CHECK) { ++ if (!spin_is_locked(lock)) ++ spin_bug(lock); ++ if (lock->owner != get_current()) ++ spin_bug(lock); ++ lock->owner = NULL; ++ } + #endif + __asm__ __volatile__( + spin_unlock_string +@@ -107,6 +138,13 @@ static inline int _raw_spin_trylock(spin + "xchgb %b0,%1" + :"=q" (oldval), "=m" (lock->lock) + :"0" (0) : "memory"); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ if (oldval > 0 && lock->owner != SPIN_DONT_CHECK) { ++ if (lock->owner != 0) ++ spin_bug(lock); ++ lock->owner = get_current(); ++ } ++#endif + return oldval > 0; + } + +@@ -117,12 +155,19 @@ static inline void _raw_spin_lock(spinlo + here: + if (unlikely(lock->magic != SPINLOCK_MAGIC)) { + printk("eip: %p\n", &&here); +- BUG(); ++ spin_bug(lock); + } + #endif + __asm__ __volatile__( + spin_lock_string + :"=m" (lock->lock) : : "memory"); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ if (lock->owner != SPIN_DONT_CHECK) { ++ if (lock->owner != 0) ++ spin_bug(lock); ++ lock->owner = get_current(); ++ } ++#endif + } + + +diff -puN kernel/sched.c~spinlock-owner.diff kernel/sched.c +--- limbo/kernel/sched.c~spinlock-owner.diff Tue Oct 21 16:42:43 2003 ++++ limbo-god/kernel/sched.c Tue Oct 21 16:42:43 2003 +@@ -2809,6 +2809,7 @@ void __init sched_init(void) + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + spin_lock_init(&rq->lock); ++ spin_lock_dont_check(&rq->lock); + INIT_LIST_HEAD(&rq->migration_queue); + atomic_set(&rq->nr_iowait, 0); + nr_running_init(rq); +diff -puN include/linux/spinlock.h~spinlock-owner.diff include/linux/spinlock.h +--- limbo/include/linux/spinlock.h~spinlock-owner.diff Tue Oct 21 16:42:43 2003 ++++ limbo-god/include/linux/spinlock.h Tue Oct 21 16:42:43 2003 +@@ -180,8 +180,8 @@ typedef struct { + #define _raw_read_unlock(lock) do { (void)(lock); } while(0) + #define _raw_write_lock(lock) do { (void)(lock); } while(0) + #define _raw_write_unlock(lock) do { (void)(lock); } while(0) +-#define _raw_write_trylock(lock) ({ (void)(lock); (1); }) +- ++#define _raw_write_trylock(lock) ({ (void)(lock); (1); }) ++#define spin_lock_dont_check(lock) + #endif /* !SMP */ + + /* + +_ diff -Nru a/patches/patches/static-inline-quotaops.diff.patch b/patches/patches/static-inline-quotaops.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/static-inline-quotaops.diff.patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,51 @@ + +diff -puN include/linux/quotaops.h~static-inline-quotaops.diff include/linux/quotaops.h +--- limbo/include/linux/quotaops.h~static-inline-quotaops.diff Tue Oct 21 16:42:47 2003 ++++ limbo-god/include/linux/quotaops.h Tue Oct 21 16:42:47 2003 +@@ -174,38 +174,38 @@ static __inline__ int DQUOT_OFF(struct s + #define DQUOT_SYNC(sb) do { } while(0) + #define DQUOT_OFF(sb) do { } while(0) + #define DQUOT_TRANSFER(inode, iattr) (0) +-extern __inline__ int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) ++static inline int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) + { + inode_add_bytes(inode, nr); + return 0; + } + +-extern __inline__ int DQUOT_PREALLOC_SPACE(struct inode *inode, qsize_t nr) ++static inline int DQUOT_PREALLOC_SPACE(struct 
inode *inode, qsize_t nr) + { + DQUOT_PREALLOC_SPACE_NODIRTY(inode, nr); + mark_inode_dirty(inode); + return 0; + } + +-extern __inline__ int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) ++static inline int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) + { + inode_add_bytes(inode, nr); + return 0; + } + +-extern __inline__ int DQUOT_ALLOC_SPACE(struct inode *inode, qsize_t nr) ++static inline int DQUOT_ALLOC_SPACE(struct inode *inode, qsize_t nr) + { + DQUOT_ALLOC_SPACE_NODIRTY(inode, nr); + mark_inode_dirty(inode); + return 0; + } + +-extern __inline__ void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr) ++static inline void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr) + { + inode_sub_bytes(inode, nr); + } + +-extern __inline__ void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr) ++static inline void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr) + { + DQUOT_FREE_SPACE_NODIRTY(inode, nr); + mark_inode_dirty(inode); + +_ diff -Nru a/patches/patches/truncate_mapping_pages_range.diff.patch b/patches/patches/truncate_mapping_pages_range.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/truncate_mapping_pages_range.diff.patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,60 @@ + +diff -puN include/linux/mm.h~truncate_mapping_pages_range.diff include/linux/mm.h +--- limbo/include/linux/mm.h~truncate_mapping_pages_range.diff Tue Oct 21 16:42:43 2003 ++++ limbo-god/include/linux/mm.h Tue Oct 21 16:42:43 2003 +@@ -566,6 +566,8 @@ can_vma_merge(struct vm_area_struct *vma + /* filemap.c */ + extern unsigned long page_unuse(struct page *); + extern void truncate_inode_pages(struct address_space *, loff_t); ++extern void truncate_mapping_pages_range(struct address_space *mapping, ++ pgoff_t lstart, long count); + + /* generic vm_area_ops exported for stackable file systems */ + extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int); +diff -puN mm/truncate.c~truncate_mapping_pages_range.diff mm/truncate.c +--- limbo/mm/truncate.c~truncate_mapping_pages_range.diff Tue Oct 21 16:42:43 2003 ++++ limbo-god/mm/truncate.c Tue Oct 21 16:42:43 2003 +@@ -178,6 +178,41 @@ void truncate_inode_pages(struct address + + EXPORT_SYMBOL(truncate_inode_pages); + ++void truncate_mapping_pages_range(struct address_space *mapping, ++ pgoff_t start, long count) ++{ ++ const pgoff_t end = start + count - 1; ++ struct pagevec pvec; ++ int i; ++ pgoff_t next; ++ ++ pagevec_init(&pvec, 0); ++ next = start; ++ while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { ++ for (i = 0; i < pagevec_count(&pvec); i++) { ++ struct page *page = pvec.pages[i]; ++ pgoff_t page_index = page->index; ++ ++ if (page_index > end) { ++ pagevec_release(&pvec); ++ return; ++ } ++ ++ lock_page(page); ++ wait_on_page_writeback(page); ++ ++ truncate_complete_page(mapping, page); ++ unlock_page(page); ++ ++ if (page_index > next) ++ next = page_index; ++ next++; ++ } ++ pagevec_release(&pvec); ++ cond_resched(); ++ } ++} ++ + /** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + +_ diff -Nru a/patches/patches/uml-AUTOCONF_INCLUDED.diff.patch b/patches/patches/uml-AUTOCONF_INCLUDED.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/uml-AUTOCONF_INCLUDED.diff.patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,15 @@ + +diff -puN arch/um/kernel/user_syms.c~uml-AUTOCONF_INCLUDED.diff arch/um/kernel/user_syms.c +--- 
limbo/arch/um/kernel/user_syms.c~uml-AUTOCONF_INCLUDED.diff	Tue Oct 21 16:42:51 2003
++++ limbo-god/arch/um/kernel/user_syms.c	Tue Oct 21 16:42:51 2003
+@@ -27,7 +27,7 @@ struct module_symbol
+ #define __MODULE_STRING_1(x)	#x
+ #define __MODULE_STRING(x)	__MODULE_STRING_1(x)
+ 
+-#if !defined(__AUTOCONF_INCLUDED__)
++#if !defined(AUTOCONF_INCLUDED)
+ 
+ #define __EXPORT_SYMBOL(sym,str) error config_must_be_included_before_module
+ #define EXPORT_SYMBOL(var) error config_must_be_included_before_module
+
+_
diff -Nru a/patches/patches/uml-asm-cpufeature-h.diff.patch b/patches/patches/uml-asm-cpufeature-h.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/uml-asm-cpufeature-h.diff.patch	Fri Oct 31 14:10:54 2003
@@ -0,0 +1,13 @@
+
+diff -puN /dev/null include/asm-um/cpufeature.h
+--- /dev/null	Sat Mar 23 22:46:34 2002
++++ limbo-god/include/asm-um/cpufeature.h	Tue Oct 21 16:42:48 2003
+@@ -0,0 +1,6 @@
++#ifndef _ASM_UM_CPUFEATURE_H
++#define _ASM_UM_CPUFEATURE_H
++
++#include "asm/arch/cpufeature.h"
++
++#endif
+
+_
diff -Nru a/patches/patches/uml-asm-local-h.diff.patch b/patches/patches/uml-asm-local-h.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/uml-asm-local-h.diff.patch	Fri Oct 31 14:10:54 2003
@@ -0,0 +1,13 @@
+
+diff -puN /dev/null include/asm-um/local.h
+--- /dev/null	Sat Mar 23 22:46:34 2002
++++ limbo-god/include/asm-um/local.h	Tue Oct 21 16:42:48 2003
+@@ -0,0 +1,6 @@
++#ifndef _ASM_UM_LOCAL_H
++#define _ASM_UM_LOCAL_H
++
++#include "asm/arch/local.h"
++
++#endif
+
+_
diff -Nru a/patches/patches/uml-asm-module-i386.h.diff.patch b/patches/patches/uml-asm-module-i386.h.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/uml-asm-module-i386.h.diff.patch	Fri Oct 31 14:10:55 2003
@@ -0,0 +1,21 @@
+
+diff -puN /dev/null include/asm-um/module-i386.h
+--- /dev/null	Sat Mar 23 22:46:34 2002
++++ limbo-god/include/asm-um/module-i386.h	Tue Oct 21 16:42:55 2003
+@@ -0,0 +1,14 @@
++#ifndef __UM_MODULE_I386_H
++#define __UM_MODULE_I386_H
++
++/* UML is simple */
++struct mod_arch_specific
++{
++};
++
++#define Elf_Shdr Elf32_Shdr
++#define Elf_Sym Elf32_Sym
++#define Elf_Ehdr Elf32_Ehdr
++
++/* __UM_MODULE_I386_H */
++#endif
+
+_
diff -Nru a/patches/patches/uml-asm-sections.diff.patch b/patches/patches/uml-asm-sections.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/uml-asm-sections.diff.patch	Fri Oct 31 14:10:54 2003
@@ -0,0 +1,14 @@
+
+diff -puN /dev/null include/asm-um/sections.h
+--- /dev/null	Sat Mar 23 22:46:34 2002
++++ limbo-god/include/asm-um/sections.h	Tue Oct 21 16:42:54 2003
+@@ -0,0 +1,7 @@
++#ifndef _UM_SECTIONS_H
++#define _UM_SECTIONS_H
++
++/* nothing to see, move along */
++#include <asm-generic/sections.h>
++
++#endif
+
+_
diff -Nru a/patches/patches/uml-export-in-ksyms.c.diff.patch b/patches/patches/uml-export-in-ksyms.c.diff.patch
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/patches/patches/uml-export-in-ksyms.c.diff.patch	Fri Oct 31 14:10:55 2003
@@ -0,0 +1,33 @@
+
+diff -puN arch/um/kernel/ksyms.c~uml-export-in-ksyms.c.diff arch/um/kernel/ksyms.c
+--- limbo/arch/um/kernel/ksyms.c~uml-export-in-ksyms.c.diff	Tue Oct 21 16:42:50 2003
++++ limbo-god/arch/um/kernel/ksyms.c	Tue Oct 21 16:42:50 2003
+@@ -90,3 +90,5 @@ EXPORT_SYMBOL(kunmap_atomic);
+ EXPORT_SYMBOL(kmap_atomic_to_page);
+ #endif
+ 
++EXPORT_SYMBOL(do_gettimeofday);
++EXPORT_SYMBOL(do_settimeofday);
+diff -puN arch/um/kernel/time.c~uml-export-in-ksyms.c.diff arch/um/kernel/time.c
+---
limbo/arch/um/kernel/time.c~uml-export-in-ksyms.c.diff Tue Oct 21 16:42:50 2003 ++++ limbo-god/arch/um/kernel/time.c Tue Oct 21 16:42:50 2003 +@@ -97,8 +97,6 @@ void do_gettimeofday(struct timeval *tv) + time_unlock(flags); + } + +-EXPORT_SYMBOL(do_gettimeofday); +- + int do_settimeofday(struct timespec *tv) + { + struct timeval now; +@@ -119,8 +117,6 @@ int do_settimeofday(struct timespec *tv) + return(0); + } + +-EXPORT_SYMBOL(do_settimeofday); +- + void idle_sleep(int secs) + { + struct timespec ts; + +_ diff -Nru a/patches/patches/uml-kill-cow.diff.patch b/patches/patches/uml-kill-cow.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/uml-kill-cow.diff.patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,59 @@ + +diff -puN arch/um/Kconfig_block~uml-kill-cow.diff arch/um/Kconfig_block +--- limbo/arch/um/Kconfig_block~uml-kill-cow.diff Tue Oct 21 16:42:53 2003 ++++ limbo-god/arch/um/Kconfig_block Tue Oct 21 16:42:53 2003 +@@ -41,7 +41,8 @@ config BLK_DEV_UBD_SYNC + + config BLK_DEV_COW_COMMON + bool +- default BLK_DEV_COW || BLK_DEV_UBD ++ default no ++# default BLK_DEV_COW || BLK_DEV_UBD + + config BLK_DEV_LOOP + tristate "Loopback device support" +diff -puN arch/um/defconfig~uml-kill-cow.diff arch/um/defconfig +--- limbo/arch/um/defconfig~uml-kill-cow.diff Tue Oct 21 16:42:53 2003 ++++ limbo-god/arch/um/defconfig Tue Oct 21 16:42:53 2003 +@@ -87,7 +87,7 @@ CONFIG_HOSTAUDIO=y + # + CONFIG_BLK_DEV_UBD=y + # CONFIG_BLK_DEV_UBD_SYNC is not set +-CONFIG_BLK_DEV_COW_COMMON=y ++CONFIG_BLK_DEV_COW_COMMON=n + CONFIG_BLK_DEV_LOOP=y + CONFIG_BLK_DEV_NBD=y + CONFIG_BLK_DEV_RAM=y +diff -puN arch/um/drivers/ubd_user.c~uml-kill-cow.diff arch/um/drivers/ubd_user.c +--- limbo/arch/um/drivers/ubd_user.c~uml-kill-cow.diff Tue Oct 21 16:42:53 2003 ++++ limbo-god/arch/um/drivers/ubd_user.c Tue Oct 21 16:42:53 2003 +@@ -24,7 +24,6 @@ + #include "user.h" + #include "ubd_user.h" + #include "os.h" +-#include "cow.h" + + #include <endian.h> + #include <byteswap.h> +@@ -97,9 +96,16 @@ int read_cow_bitmap(int fd, void *buf, i + return(0); + } + +-int open_ubd_file(char *file, struct openflags *openflags, +- char **backing_file_out, int *bitmap_offset_out, +- unsigned long *bitmap_len_out, int *data_offset_out, ++#define read_cow_header(file_reader, fd, magic, backing_file, mtime, size, sectorsize, bitmap_offset_out) (0) ++#define write_cow_header(file, fd, backing_file_out, sectorsize, size) (0) ++ ++#define cow_sizes(size, sectorsize, bitmap_offset_out, bitmap_len_out, data_offset_out) do {;} while(0) ++ ++#define init_cow_file(fd, cow_file, backing_file, sectorsize, bitmap_offset_out, bitmap_len_out, data_offset_out) (0) ++ ++int open_ubd_file(char *file, struct openflags *openflags, ++ char **backing_file_out, int *bitmap_offset_out, ++ unsigned long *bitmap_len_out, int *data_offset_out, + int *create_cow_out) + { + time_t mtime; + +_ diff -Nru a/patches/patches/uml-kill-irq_kern.h.diff.patch b/patches/patches/uml-kill-irq_kern.h.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/uml-kill-irq_kern.h.diff.patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,94 @@ + +diff -puN arch/um/drivers/line.c~uml-kill-irq_kern.h.diff arch/um/drivers/line.c +--- limbo/arch/um/drivers/line.c~uml-kill-irq_kern.h.diff Tue Oct 21 16:42:49 2003 ++++ limbo-god/arch/um/drivers/line.c Tue Oct 21 16:42:49 2003 +@@ -16,7 +16,6 @@ + #include "user_util.h" + #include "kern_util.h" + #include "os.h" +-#include "irq_kern.h" + + #define LINE_BUFSIZE 4096 + +diff -puN 
arch/um/drivers/mconsole_kern.c~uml-kill-irq_kern.h.diff arch/um/drivers/mconsole_kern.c +--- limbo/arch/um/drivers/mconsole_kern.c~uml-kill-irq_kern.h.diff Tue Oct 21 16:42:49 2003 ++++ limbo-god/arch/um/drivers/mconsole_kern.c Tue Oct 21 16:42:49 2003 +@@ -27,9 +27,8 @@ + #include "init.h" + #include "os.h" + #include "umid.h" +-#include "irq_kern.h" + +-static int do_unlink_socket(struct notifier_block *notifier, ++static int do_unlink_socket(struct notifier_block *notifier, + unsigned long what, void *data) + { + return(mconsole_unlink_socket()); +diff -puN arch/um/drivers/net_kern.c~uml-kill-irq_kern.h.diff arch/um/drivers/net_kern.c +--- limbo/arch/um/drivers/net_kern.c~uml-kill-irq_kern.h.diff Tue Oct 21 16:42:49 2003 ++++ limbo-god/arch/um/drivers/net_kern.c Tue Oct 21 16:42:49 2003 +@@ -26,7 +26,6 @@ + #include "mconsole_kern.h" + #include "init.h" + #include "irq_user.h" +-#include "irq_kern.h" + + static spinlock_t opened_lock = SPIN_LOCK_UNLOCKED; + LIST_HEAD(opened); +diff -puN arch/um/drivers/port_kern.c~uml-kill-irq_kern.h.diff arch/um/drivers/port_kern.c +--- limbo/arch/um/drivers/port_kern.c~uml-kill-irq_kern.h.diff Tue Oct 21 16:42:49 2003 ++++ limbo-god/arch/um/drivers/port_kern.c Tue Oct 21 16:42:49 2003 +@@ -15,7 +15,6 @@ + #include "kern_util.h" + #include "kern.h" + #include "irq_user.h" +-#include "irq_kern.h" + #include "port.h" + #include "init.h" + #include "os.h" +diff -puN arch/um/drivers/ubd_kern.c~uml-kill-irq_kern.h.diff arch/um/drivers/ubd_kern.c +--- limbo/arch/um/drivers/ubd_kern.c~uml-kill-irq_kern.h.diff Tue Oct 21 16:42:49 2003 ++++ limbo-god/arch/um/drivers/ubd_kern.c Tue Oct 21 16:42:49 2003 +@@ -49,7 +49,6 @@ + #include "mconsole_kern.h" + #include "init.h" + #include "irq_user.h" +-#include "irq_kern.h" + #include "ubd_user.h" + #include "2_5compat.h" + #include "os.h" +diff -puN arch/um/drivers/xterm_kern.c~uml-kill-irq_kern.h.diff arch/um/drivers/xterm_kern.c +--- limbo/arch/um/drivers/xterm_kern.c~uml-kill-irq_kern.h.diff Tue Oct 21 16:42:49 2003 ++++ limbo-god/arch/um/drivers/xterm_kern.c Tue Oct 21 16:42:49 2003 +@@ -11,7 +11,6 @@ + #include "linux/signal.h" + #include "asm/irq.h" + #include "irq_user.h" +-#include "irq_kern.h" + #include "kern_util.h" + #include "os.h" + #include "xterm.h" +diff -puN arch/um/kernel/irq.c~uml-kill-irq_kern.h.diff arch/um/kernel/irq.c +--- limbo/arch/um/kernel/irq.c~uml-kill-irq_kern.h.diff Tue Oct 21 16:42:49 2003 ++++ limbo-god/arch/um/kernel/irq.c Tue Oct 21 16:42:49 2003 +@@ -29,7 +29,6 @@ + #include "user_util.h" + #include "kern_util.h" + #include "irq_user.h" +-#include "irq_kern.h" + + static void register_irq_proc (unsigned int irq); + +diff -puN arch/um/kernel/sigio_kern.c~uml-kill-irq_kern.h.diff arch/um/kernel/sigio_kern.c +--- limbo/arch/um/kernel/sigio_kern.c~uml-kill-irq_kern.h.diff Tue Oct 21 16:42:49 2003 ++++ limbo-god/arch/um/kernel/sigio_kern.c Tue Oct 21 16:42:49 2003 +@@ -11,7 +11,6 @@ + #include "init.h" + #include "sigio.h" + #include "irq_user.h" +-#include "irq_kern.h" + + /* Protected by sigio_lock() called from write_sigio_workaround */ + static int sigio_irq_fd = -1; + +_ diff -Nru a/patches/patches/uml-sched_clock.diff.patch b/patches/patches/uml-sched_clock.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/uml-sched_clock.diff.patch Fri Oct 31 14:10:54 2003 @@ -0,0 +1,28 @@ + +diff -puN arch/um/kernel/time_kern.c~uml-sched_clock.diff arch/um/kernel/time_kern.c +--- limbo/arch/um/kernel/time_kern.c~uml-sched_clock.diff Tue Oct 21 16:42:51 2003 ++++ 
limbo-god/arch/um/kernel/time_kern.c Tue Oct 21 16:42:51 2003 +@@ -30,11 +30,19 @@ int hz(void) + return(HZ); + } + ++/* ++ * Scheduler clock - returns current time in nanosec units. ++ */ ++unsigned long long sched_clock(void) ++{ ++ return (unsigned long long)jiffies_64 * (1000000000 / HZ); ++} ++ + /* Changed at early boot */ + int timer_irq_inited = 0; + +-/* missed_ticks will be modified after kernel memory has been +- * write-protected, so this puts it in a section which will be left ++/* missed_ticks will be modified after kernel memory has been ++ * write-protected, so this puts it in a section which will be left + * write-enabled. + */ + int __attribute__ ((__section__ (".unprotected"))) missed_ticks[NR_CPUS]; + +_ diff -Nru a/patches/patches/uml-summa.diff.patch b/patches/patches/uml-summa.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/uml-summa.diff.patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,8215 @@ + +diff -puN arch/um/Kconfig~uml-summa.diff arch/um/Kconfig +--- limbo/arch/um/Kconfig~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/Kconfig Tue Oct 21 16:42:38 2003 +@@ -61,6 +61,20 @@ config MODE_SKAS + + config NET + bool "Networking support" ++ help ++ Unless you really know what you are doing, you should say Y here. ++ The reason is that some programs need kernel networking support even ++ when running on a stand-alone machine that isn't connected to any ++ other computer. If you are upgrading from an older kernel, you ++ should consider updating your networking tools too because changes ++ in the kernel and the tools often go hand in hand. The tools are ++ contained in the package net-tools, the location and version number ++ of which are given in Documentation/Changes. ++ ++ For a general introduction to Linux networking, it is highly ++ recommended to read the NET-HOWTO, available from ++ <http://www.tldp.org/docs.html#howto>. ++ + + source "fs/Kconfig.binfmt" + +@@ -85,6 +99,19 @@ config HOSTFS + If you'd like to be able to work with files stored on the host, + say Y or M here; otherwise say N. + ++config HPPFS ++ tristate "HoneyPot ProcFS" ++ help ++ hppfs (HoneyPot ProcFS) is a filesystem which allows UML /proc ++ entries to be overridden, removed, or fabricated from the host. ++ Its purpose is to allow a UML to appear to be a physical machine ++ by removing or changing anything in /proc which gives away the ++ identity of a UML. ++ ++ See http://user-mode-linux.sf.net/hppfs.html for more information. ++ ++ You only need this if you are setting up a UML honeypot. Otherwise, ++ it is safe to say 'N' here. + + config MCONSOLE + bool "Management console" +@@ -105,6 +132,16 @@ config MCONSOLE + config MAGIC_SYSRQ + bool "Magic SysRq key" + depends on MCONSOLE ++ help ++ If you say Y here, you will have some control over the system even ++ if the system crashes for example during kernel debugging (e.g., you ++ will be able to flush the buffer cache to disk, reboot the system ++ immediately or dump some status information). This is accomplished ++ by pressing various keys while holding SysRq (Alt+PrintScreen). It ++ also works on a serial console (on PC hardware at least), if you ++ send a BREAK and then within 5 seconds a command keypress. The ++ keys are documented in Documentation/sysrq.txt. Don't say Y ++ unless you really know what this hack does. 
+ + config HOST_2G_2G + bool "2G/2G host address space split" +@@ -159,6 +196,9 @@ config KERNEL_HALF_GIGS + config HIGHMEM + bool "Highmem support" + ++config PROC_MM ++ bool "/proc/mm support" ++ + config KERNEL_STACK_ORDER + int "Kernel stack size order" + default 2 +@@ -181,10 +221,10 @@ config NETDEVICES + bool + default NET + +-source "arch/um/Kconfig_net" +- + source "net/Kconfig" + ++source "arch/um/Kconfig_net" ++ + source "fs/Kconfig" + + source "security/Kconfig" +@@ -239,6 +279,10 @@ config FRAME_POINTER + config PT_PROXY + bool "Enable ptrace proxy" + depends on XTERM_CHAN && DEBUG_INFO ++ help ++ This option enables a debugging interface which allows gdb to debug ++ the kernel without needing to actually attach to kernel threads. ++ If you want to do kernel debugging, say Y here; otherwise say N. + + config GPROF + bool "Enable gprof support" +diff -puN arch/um/Kconfig_block~uml-summa.diff arch/um/Kconfig_block +--- limbo/arch/um/Kconfig_block~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/Kconfig_block Tue Oct 21 16:42:38 2003 +@@ -29,6 +29,20 @@ config BLK_DEV_UBD_SYNC + wise choice too. In all other cases (for example, if you're just + playing around with User-Mode Linux) you can choose N. + ++# Turn this back on when the driver actually works ++# ++#config BLK_DEV_COW ++# tristate "COW block device" ++# help ++# This is a layered driver which sits above two other block devices. ++# One is read-only, and the other is a read-write layer which stores ++# all changes. This provides the illusion that the read-only layer ++# can be mounted read-write and changed. ++ ++config BLK_DEV_COW_COMMON ++ bool ++ default BLK_DEV_COW || BLK_DEV_UBD ++ + config BLK_DEV_LOOP + tristate "Loopback device support" + +diff -puN arch/um/Kconfig_net~uml-summa.diff arch/um/Kconfig_net +--- limbo/arch/um/Kconfig_net~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/Kconfig_net Tue Oct 21 16:42:38 2003 +@@ -1,5 +1,5 @@ + +-menu "Network Devices" ++menu "UML Network Devices" + depends on NET + + # UML virtual driver +@@ -176,73 +176,5 @@ config UML_NET_SLIRP + + Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp" + +- +-# Below are hardware-independent drivers mirrored from +-# drivers/net/Config.in. It would be nice if Linux +-# had HW independent drivers separated from the other +-# but it does not. 
Until then each non-ISA/PCI arch +-# needs to provide it's own menu of network drivers +-config DUMMY +- tristate "Dummy net driver support" +- +-config BONDING +- tristate "Bonding driver support" +- +-config EQUALIZER +- tristate "EQL (serial line load balancing) support" +- +-config TUN +- tristate "Universal TUN/TAP device driver support" +- +-config ETHERTAP +- tristate "Ethertap network tap (OBSOLETE)" +- depends on EXPERIMENTAL && NETLINK +- +-config PPP +- tristate "PPP (point-to-point protocol) support" +- +-config PPP_MULTILINK +- bool "PPP multilink support (EXPERIMENTAL)" +- depends on PPP && EXPERIMENTAL +- +-config PPP_FILTER +- bool "PPP filtering" +- depends on PPP && FILTER +- +-config PPP_ASYNC +- tristate "PPP support for async serial ports" +- depends on PPP +- +-config PPP_SYNC_TTY +- tristate "PPP support for sync tty ports" +- depends on PPP +- +-config PPP_DEFLATE +- tristate "PPP Deflate compression" +- depends on PPP +- +-config PPP_BSDCOMP +- tristate "PPP BSD-Compress compression" +- depends on PPP +- +-config PPPOE +- tristate "PPP over Ethernet (EXPERIMENTAL)" +- depends on PPP && EXPERIMENTAL +- +-config SLIP +- tristate "SLIP (serial line) support" +- +-config SLIP_COMPRESSED +- bool "CSLIP compressed headers" +- depends on SLIP=y +- +-config SLIP_SMART +- bool "Keepalive and linefill" +- depends on SLIP=y +- +-config SLIP_MODE_SLIP6 +- bool "Six bit SLIP encapsulation" +- depends on SLIP=y +- + endmenu + +diff -puN arch/um/Makefile~uml-summa.diff arch/um/Makefile +--- limbo/arch/um/Makefile~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/Makefile Tue Oct 21 16:42:38 2003 +@@ -24,15 +24,17 @@ core-y += $(ARCH_DIR)/kernel/ \ + # Have to precede the include because the included Makefiles reference them. + SYMLINK_HEADERS = include/asm-um/archparam.h include/asm-um/system.h \ + include/asm-um/sigcontext.h include/asm-um/processor.h \ +- include/asm-um/ptrace.h include/asm-um/arch-signal.h ++ include/asm-um/ptrace.h include/asm-um/arch-signal.h \ ++ include/asm-um/module.h + + ARCH_SYMLINKS = include/asm-um/arch $(ARCH_DIR)/include/sysdep $(ARCH_DIR)/os \ + $(SYMLINK_HEADERS) $(ARCH_DIR)/include/uml-config.h + + GEN_HEADERS += $(ARCH_DIR)/include/task.h $(ARCH_DIR)/include/kern_constants.h + +-include $(ARCH_DIR)/Makefile-$(SUBARCH) +-include $(ARCH_DIR)/Makefile-os-$(OS) ++.PHONY: sys_prepare ++sys_prepare: ++ @: + + MAKEFILE-$(CONFIG_MODE_TT) += Makefile-tt + MAKEFILE-$(CONFIG_MODE_SKAS) += Makefile-skas +@@ -41,6 +43,9 @@ ifneq ($(MAKEFILE-y),) + include $(addprefix $(ARCH_DIR)/,$(MAKEFILE-y)) + endif + ++include $(ARCH_DIR)/Makefile-$(SUBARCH) ++include $(ARCH_DIR)/Makefile-os-$(OS) ++ + EXTRAVERSION := $(EXTRAVERSION)-1um + + ARCH_INCLUDE = -I$(ARCH_DIR)/include +@@ -52,14 +57,16 @@ ARCH_INCLUDE = -I$(ARCH_DIR)/include + + CFLAGS += $(CFLAGS-y) -D__arch_um__ -DSUBARCH=\"$(SUBARCH)\" \ + -D_LARGEFILE64_SOURCE $(ARCH_INCLUDE) -Derrno=kernel_errno \ +- $(MODE_INCLUDE) ++ -Dsigprocmask=kernel_sigprocmask $(MODE_INCLUDE) + + LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc + ++CONFIG_NEST_LEVEL ?= 0 ++CONFIG_KERNEL_HALF_GIGS ?= 1 + SIZE = (($(CONFIG_NEST_LEVEL) + $(CONFIG_KERNEL_HALF_GIGS)) * 0x20000000) + + ifeq ($(CONFIG_MODE_SKAS), y) +-$(SYS_HEADERS) : $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h ++$(SYS_HEADERS) : $(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h + endif + + include/linux/version.h: arch/$(ARCH)/Makefile +@@ -98,17 +105,17 @@ CPP_MODE_TT := $(shell [ "$(CONFIG_MODE_ + CONFIG_KERNEL_STACK_ORDER ?= 2 + STACK_SIZE := 
$(shell echo $$[ 4096 * (1 << $(CONFIG_KERNEL_STACK_ORDER)) ] ) + +-AFLAGS_vmlinux.lds.o = -U$(SUBARCH) \ ++AFLAGS_vmlinux.lds.o = $(shell echo -U$(SUBARCH) \ + -DSTART=$$(($(TOP_ADDR) - $(SIZE))) -DELF_ARCH=$(ELF_ARCH) \ + -DELF_FORMAT=\"$(ELF_FORMAT)\" $(CPP_MODE_TT) \ +- -DKERNEL_STACK_SIZE=$(STACK_SIZE) ++ -DKERNEL_STACK_SIZE=$(STACK_SIZE)) + +-AFLAGS_$(LD_SCRIPT-y:.s=).o = $(AFLAGS_vmlinux.lds.o) -P -C -Uum ++export AFLAGS_$(LD_SCRIPT-y:.s=).o = $(AFLAGS_vmlinux.lds.o) -P -C -Uum + + LD_SCRIPT-y := $(ARCH_DIR)/$(LD_SCRIPT-y) + +-$(LD_SCRIPT-y) : $(LD_SCRIPT-y:.s=.S) scripts FORCE +- $(call if_changed_dep,as_s_S) ++#$(LD_SCRIPT-y) : $(LD_SCRIPT-y:.s=.S) scripts FORCE ++# $(call if_changed_dep,as_s_S) + + linux: vmlinux $(LD_SCRIPT-y) + $(CC) -Wl,-T,$(LD_SCRIPT-y) $(LINK-y) $(LINK_WRAPS) \ +@@ -116,6 +123,7 @@ linux: vmlinux $(LD_SCRIPT-y) + + USER_CFLAGS := $(patsubst -I%,,$(CFLAGS)) + USER_CFLAGS := $(patsubst -Derrno=kernel_errno,,$(USER_CFLAGS)) ++USER_CFLAGS := $(patsubst -Dsigprocmask=kernel_sigprocmask,,$(USER_CFLAGS)) + USER_CFLAGS := $(patsubst -D__KERNEL__,,$(USER_CFLAGS)) $(ARCH_INCLUDE) \ + $(MODE_INCLUDE) + +@@ -123,9 +131,10 @@ USER_CFLAGS := $(patsubst -D__KERNEL__,, + USER_CFLAGS += -D_GNU_SOURCE + + CLEAN_FILES += linux x.i gmon.out $(ARCH_DIR)/uml.lds.s \ +- $(ARCH_DIR)/dyn_link.ld.s $(GEN_HEADERS) ++ $(ARCH_DIR)/dyn_link.ld.s $(ARCH_DIR)/include/uml-config.h \ ++ $(GEN_HEADERS) + +-$(ARCH_DIR)/main.o: $(ARCH_DIR)/main.c ++$(ARCH_DIR)/main.o: $(ARCH_DIR)/main.c sys_prepare + $(CC) $(USER_CFLAGS) $(EXTRA_CFLAGS) -c -o $@ $< + + archmrproper: +@@ -161,19 +170,23 @@ $(ARCH_DIR)/include/sysdep: + $(ARCH_DIR)/os: + cd $(ARCH_DIR) && ln -sf os-$(OS) os + +-$(ARCH_DIR)/include/uml-config.h : ++$(ARCH_DIR)/include/uml-config.h : $(TOPDIR)/include/linux/autoconf.h + sed 's/ CONFIG/ UML_CONFIG/' $(TOPDIR)/include/linux/autoconf.h > $@ + ++filechk_$(ARCH_DIR)/include/task.h := $(ARCH_DIR)/util/mk_task ++ + $(ARCH_DIR)/include/task.h : $(ARCH_DIR)/util/mk_task +- $< > $@ ++ $(call filechk,$@) ++ ++filechk_$(ARCH_DIR)/include/kern_constants.h := $(ARCH_DIR)/util/mk_constants + + $(ARCH_DIR)/include/kern_constants.h : $(ARCH_DIR)/util/mk_constants +- $< > $@ ++ $(call filechk,$@) + +-$(ARCH_DIR)/util/mk_task : $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h \ +- $(ARCH_DIR)/util FORCE ; ++$(ARCH_DIR)/util/mk_task $(ARCH_DIR)/util/mk_constants : $(ARCH_DIR)/util \ ++ sys_prepare FORCE ; + + $(ARCH_DIR)/util: FORCE +- @$(call descend,$@,) ++ $(MAKE) -f scripts/Makefile.build obj=$@ + +-export SUBARCH USER_CFLAGS OS ++export SUBARCH USER_CFLAGS OS +diff -puN arch/um/Makefile-i386~uml-summa.diff arch/um/Makefile-i386 +--- limbo/arch/um/Makefile-i386~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/Makefile-i386 Tue Oct 21 16:42:38 2003 +@@ -16,22 +16,28 @@ SYS_UTIL_DIR := $(ARCH_DIR)/sys-i386/uti + + SYS_HEADERS = $(SYS_DIR)/sc.h $(SYS_DIR)/thread.h + ++sys_prepare: $(SYS_DIR)/sc.h ++ + prepare: $(SYS_HEADERS) + ++filechk_$(SYS_DIR)/sc.h := $(SYS_UTIL_DIR)/mk_sc ++ + $(SYS_DIR)/sc.h: $(SYS_UTIL_DIR)/mk_sc +- $< > $@ ++ $(call filechk,$@) ++ ++filechk_$(SYS_DIR)/thread.h := $(SYS_UTIL_DIR)/mk_thread + + $(SYS_DIR)/thread.h: $(SYS_UTIL_DIR)/mk_thread +- $< > $@ ++ $(call filechk,$@) + +-$(SYS_UTIL_DIR)/mk_sc: FORCE ; +- @$(call descend,$(SYS_UTIL_DIR),$@) ++$(SYS_UTIL_DIR)/mk_sc: scripts/fixdep include/config/MARKER FORCE ; ++ +@$(call descend,$(SYS_UTIL_DIR),$@) + +-$(SYS_UTIL_DIR)/mk_thread: $(ARCH_SYMLINKS) $(GEN_HEADERS) FORCE ; +- @$(call descend,$(SYS_UTIL_DIR),$@) 
++$(SYS_UTIL_DIR)/mk_thread: $(ARCH_SYMLINKS) $(GEN_HEADERS) sys_prepare FORCE ; ++ +@$(call descend,$(SYS_UTIL_DIR),$@) + + $(SYS_UTIL_DIR): include/asm FORCE +- @$(call descend,$@,) ++ +@$(call descend,$@,) + + sysclean : + rm -f $(SYS_HEADERS) +diff -puN arch/um/Makefile-skas~uml-summa.diff arch/um/Makefile-skas +--- limbo/arch/um/Makefile-skas~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/Makefile-skas Tue Oct 21 16:42:38 2003 +@@ -14,7 +14,7 @@ MODE_INCLUDE += -I$(TOPDIR)/$(ARCH_DIR)/ + LINK_SKAS = -Wl,-rpath,/lib + LD_SCRIPT_SKAS = dyn.lds.s + +-GEN_HEADERS += $(ARCH_DIR)/kernel/skas/include/skas_ptregs.h ++GEN_HEADERS += $(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h + +-$(ARCH_DIR)/kernel/skas/include/skas_ptregs.h : +- $(MAKE) -C $(ARCH_DIR)/kernel/skas include/skas_ptregs.h ++$(TOPDIR)/$(ARCH_DIR)/include/skas_ptregs.h : ++ $(call descend,$(ARCH_DIR)/kernel/skas,$@) +diff -puN arch/um/config.release~uml-summa.diff arch/um/config.release +--- limbo/arch/um/config.release~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/config.release Tue Oct 21 16:42:38 2003 +@@ -228,7 +228,6 @@ CONFIG_ROMFS_FS=m + CONFIG_EXT2_FS=y + CONFIG_SYSV_FS=m + CONFIG_UDF_FS=m +-# CONFIG_UDF_RW is not set + CONFIG_UFS_FS=m + # CONFIG_UFS_FS_WRITE is not set + +diff -puN arch/um/defconfig~uml-summa.diff arch/um/defconfig +--- limbo/arch/um/defconfig~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/defconfig Tue Oct 21 16:42:38 2003 +@@ -3,29 +3,19 @@ + # + CONFIG_USERMODE=y + CONFIG_MMU=y +-CONFIG_SWAP=y + CONFIG_UID16=y + CONFIG_RWSEM_GENERIC_SPINLOCK=y +-CONFIG_CONFIG_LOG_BUF_SHIFT=14 +- +-# +-# Code maturity level options +-# +-CONFIG_EXPERIMENTAL=y + + # +-# General Setup ++# UML-specific options + # + CONFIG_MODE_TT=y + CONFIG_MODE_SKAS=y + CONFIG_NET=y +-CONFIG_SYSVIPC=y +-CONFIG_BSD_PROCESS_ACCT=y +-CONFIG_SYSCTL=y +-CONFIG_BINFMT_AOUT=y + CONFIG_BINFMT_ELF=y + CONFIG_BINFMT_MISC=y + CONFIG_HOSTFS=y ++CONFIG_HPPFS=y + CONFIG_MCONSOLE=y + CONFIG_MAGIC_SYSRQ=y + # CONFIG_HOST_2G_2G is not set +@@ -38,10 +28,38 @@ CONFIG_PROC_MM=y + CONFIG_KERNEL_STACK_ORDER=2 + + # ++# Code maturity level options ++# ++CONFIG_EXPERIMENTAL=y ++CONFIG_CLEAN_COMPILE=y ++CONFIG_STANDALONE=y ++CONFIG_BROKEN_ON_SMP=y ++ ++# ++# General setup ++# ++CONFIG_SWAP=y ++CONFIG_SYSVIPC=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_SYSCTL=y ++CONFIG_LOG_BUF_SHIFT=14 ++# CONFIG_IKCONFIG is not set ++# CONFIG_EMBEDDED is not set ++CONFIG_KALLSYMS=y ++CONFIG_FUTEX=y ++CONFIG_EPOLL=y ++CONFIG_IOSCHED_NOOP=y ++CONFIG_IOSCHED_AS=y ++CONFIG_IOSCHED_DEADLINE=y ++ ++# + # Loadable module support + # +-CONFIG_MODULES=y +-# CONFIG_KMOD is not set ++# CONFIG_MODULES is not set ++ ++# ++# Generic Driver Options ++# + + # + # Character Devices +@@ -69,6 +87,7 @@ CONFIG_HOSTAUDIO=y + # + CONFIG_BLK_DEV_UBD=y + # CONFIG_BLK_DEV_UBD_SYNC is not set ++CONFIG_BLK_DEV_COW_COMMON=y + CONFIG_BLK_DEV_LOOP=y + CONFIG_BLK_DEV_NBD=y + CONFIG_BLK_DEV_RAM=y +@@ -78,7 +97,7 @@ CONFIG_BLK_DEV_INITRD=y + CONFIG_NETDEVICES=y + + # +-# Network Devices ++# UML Network Devices + # + CONFIG_UML_NET=y + CONFIG_UML_NET_ETHERTAP=y +@@ -88,22 +107,6 @@ CONFIG_UML_NET_DAEMON=y + CONFIG_UML_NET_MCAST=y + # CONFIG_UML_NET_PCAP is not set + CONFIG_UML_NET_SLIRP=y +-CONFIG_DUMMY=y +-# CONFIG_BONDING is not set +-# CONFIG_EQUALIZER is not set +-CONFIG_TUN=y +-# CONFIG_ETHERTAP is not set +-CONFIG_PPP=y +-# CONFIG_PPP_MULTILINK is not set +-# CONFIG_PPP_ASYNC is not set +-# CONFIG_PPP_SYNC_TTY is not set +-# CONFIG_PPP_DEFLATE is not set +-# 
CONFIG_PPP_BSDCOMP is not set +-# CONFIG_PPPOE is not set +-CONFIG_SLIP=y +-# CONFIG_SLIP_COMPRESSED is not set +-# CONFIG_SLIP_SMART is not set +-# CONFIG_SLIP_MODE_SLIP6 is not set + + # + # Networking support +@@ -115,8 +118,6 @@ CONFIG_SLIP=y + CONFIG_PACKET=y + CONFIG_PACKET_MMAP=y + # CONFIG_NETLINK_DEV is not set +-# CONFIG_NETFILTER is not set +-# CONFIG_FILTER is not set + CONFIG_UNIX=y + # CONFIG_NET_KEY is not set + CONFIG_INET=y +@@ -130,8 +131,11 @@ CONFIG_INET=y + # CONFIG_SYN_COOKIES is not set + # CONFIG_INET_AH is not set + # CONFIG_INET_ESP is not set +-# CONFIG_XFRM_USER is not set ++# CONFIG_INET_IPCOMP is not set + # CONFIG_IPV6 is not set ++# CONFIG_DECNET is not set ++# CONFIG_BRIDGE is not set ++# CONFIG_NETFILTER is not set + + # + # SCTP Configuration (EXPERIMENTAL) +@@ -141,8 +145,6 @@ CONFIG_IPV6_SCTP__=y + # CONFIG_ATM is not set + # CONFIG_VLAN_8021Q is not set + # CONFIG_LLC is not set +-# CONFIG_DECNET is not set +-# CONFIG_BRIDGE is not set + # CONFIG_X25 is not set + # CONFIG_LAPB is not set + # CONFIG_NET_DIVERT is not set +@@ -160,6 +162,10 @@ CONFIG_IPV6_SCTP__=y + # Network testing + # + # CONFIG_NET_PKTGEN is not set ++CONFIG_DUMMY=y ++# CONFIG_BONDING is not set ++# CONFIG_EQUALIZER is not set ++CONFIG_TUN=y + + # + # Ethernet (10 or 100Mbit) +@@ -171,6 +177,22 @@ CONFIG_IPV6_SCTP__=y + # + + # ++# Ethernet (10000 Mbit) ++# ++CONFIG_PPP=y ++# CONFIG_PPP_MULTILINK is not set ++# CONFIG_PPP_FILTER is not set ++# CONFIG_PPP_ASYNC is not set ++# CONFIG_PPP_SYNC_TTY is not set ++# CONFIG_PPP_DEFLATE is not set ++# CONFIG_PPP_BSDCOMP is not set ++# CONFIG_PPPOE is not set ++CONFIG_SLIP=y ++# CONFIG_SLIP_COMPRESSED is not set ++# CONFIG_SLIP_SMART is not set ++# CONFIG_SLIP_MODE_SLIP6 is not set ++ ++# + # Wireless LAN (non-hamradio) + # + # CONFIG_NET_RADIO is not set +@@ -188,66 +210,82 @@ CONFIG_IPV6_SCTP__=y + # + # File systems + # ++CONFIG_EXT2_FS=y ++# CONFIG_EXT2_FS_XATTR is not set ++# CONFIG_EXT3_FS is not set ++# CONFIG_JBD is not set ++CONFIG_REISERFS_FS=y ++# CONFIG_REISERFS_CHECK is not set ++# CONFIG_REISERFS_PROC_INFO is not set ++# CONFIG_JFS_FS is not set ++# CONFIG_XFS_FS is not set ++CONFIG_MINIX_FS=y ++# CONFIG_ROMFS_FS is not set + CONFIG_QUOTA=y + # CONFIG_QFMT_V1 is not set + # CONFIG_QFMT_V2 is not set + CONFIG_QUOTACTL=y +-CONFIG_AUTOFS_FS=m +-CONFIG_AUTOFS4_FS=m +-CONFIG_REISERFS_FS=m +-# CONFIG_REISERFS_CHECK is not set +-# CONFIG_REISERFS_PROC_INFO is not set ++CONFIG_AUTOFS_FS=y ++CONFIG_AUTOFS4_FS=y ++ ++# ++# CD-ROM/DVD Filesystems ++# ++CONFIG_ISO9660_FS=y ++# CONFIG_JOLIET is not set ++# CONFIG_ZISOFS is not set ++# CONFIG_UDF_FS is not set ++ ++# ++# DOS/FAT/NT Filesystems ++# ++CONFIG_FAT_FS=y ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++# CONFIG_NTFS_FS is not set ++ ++# ++# Pseudo filesystems ++# ++CONFIG_PROC_FS=y ++CONFIG_DEVFS_FS=y ++CONFIG_DEVFS_MOUNT=y ++# CONFIG_DEVFS_DEBUG is not set ++CONFIG_DEVPTS_FS=y ++# CONFIG_DEVPTS_FS_XATTR is not set ++# CONFIG_TMPFS is not set ++CONFIG_RAMFS=y ++ ++# ++# Miscellaneous filesystems ++# + # CONFIG_ADFS_FS is not set + # CONFIG_AFFS_FS is not set + # CONFIG_HFS_FS is not set + # CONFIG_BEFS_FS is not set + # CONFIG_BFS_FS is not set +-# CONFIG_EXT3_FS is not set +-# CONFIG_JBD is not set +-CONFIG_FAT_FS=m +-CONFIG_MSDOS_FS=m +-CONFIG_VFAT_FS=m + # CONFIG_EFS_FS is not set + CONFIG_JFFS_FS=y + CONFIG_JFFS_FS_VERBOSE=0 +-CONFIG_JFFS_PROC_FS=y + # CONFIG_JFFS2_FS is not set + # CONFIG_CRAMFS is not set +-# CONFIG_TMPFS is not set +-CONFIG_RAMFS=y +-CONFIG_ISO9660_FS=m +-# 
CONFIG_JOLIET is not set +-# CONFIG_ZISOFS is not set +-# CONFIG_JFS_FS is not set +-CONFIG_MINIX_FS=m + # CONFIG_VXFS_FS is not set +-# CONFIG_NTFS_FS is not set + # CONFIG_HPFS_FS is not set +-CONFIG_PROC_FS=y +-CONFIG_DEVFS_FS=y +-CONFIG_DEVFS_MOUNT=y +-# CONFIG_DEVFS_DEBUG is not set +-CONFIG_DEVPTS_FS=y + # CONFIG_QNX4FS_FS is not set +-# CONFIG_ROMFS_FS is not set +-CONFIG_EXT2_FS=y +-# CONFIG_EXT2_FS_XATTR is not set + # CONFIG_SYSV_FS is not set +-# CONFIG_UDF_FS is not set + # CONFIG_UFS_FS is not set +-# CONFIG_XFS_FS is not set + + # + # Network File Systems + # +-# CONFIG_CODA_FS is not set +-# CONFIG_INTERMEZZO_FS is not set + # CONFIG_NFS_FS is not set + # CONFIG_NFSD is not set + # CONFIG_EXPORTFS is not set +-# CONFIG_CIFS is not set + # CONFIG_SMB_FS is not set ++# CONFIG_CIFS is not set + # CONFIG_NCP_FS is not set ++# CONFIG_CODA_FS is not set ++# CONFIG_INTERMEZZO_FS is not set + # CONFIG_AFS_FS is not set + + # +@@ -317,28 +355,7 @@ CONFIG_NLS_DEFAULT="iso8859-1" + # + # SCSI support + # +-CONFIG_SCSI=y +-CONFIG_GENERIC_ISA_DMA=y +- +-# +-# SCSI support type (disk, tape, CD-ROM) +-# +-CONFIG_BLK_DEV_SD=y +-CONFIG_SD_EXTRA_DEVS=40 +-CONFIG_CHR_DEV_ST=y +-CONFIG_BLK_DEV_SR=y +-CONFIG_BLK_DEV_SR_VENDOR=y +-CONFIG_SR_EXTRA_DEVS=2 +-CONFIG_CHR_DEV_SG=y +- +-# +-# Some SCSI devices (e.g. CD jukebox) support multiple LUNs +-# +-CONFIG_SCSI_DEBUG_QUEUES=y +-CONFIG_SCSI_MULTI_LUN=y +-CONFIG_SCSI_CONSTANTS=y +-CONFIG_SCSI_LOGGING=y +-CONFIG_SCSI_DEBUG=y ++# CONFIG_SCSI is not set + + # + # Multi-device support (RAID and LVM) +@@ -360,6 +377,7 @@ CONFIG_MTD_CHAR=y + CONFIG_MTD_BLOCK=y + # CONFIG_FTL is not set + # CONFIG_NFTL is not set ++# CONFIG_INFTL is not set + + # + # RAM/ROM/Flash chip drivers +@@ -374,20 +392,21 @@ CONFIG_MTD_BLOCK=y + # + # Mapping drivers for chip access + # ++# CONFIG_MTD_COMPLEX_MAPPINGS is not set + + # + # Self-contained MTD device drivers + # + # CONFIG_MTD_SLRAM is not set + # CONFIG_MTD_MTDRAM is not set +-CONFIG_MTD_BLKMTD=m ++CONFIG_MTD_BLKMTD=y + + # + # Disk-On-Chip Device Drivers + # +-# CONFIG_MTD_DOC1000 is not set + # CONFIG_MTD_DOC2000 is not set + # CONFIG_MTD_DOC2001 is not set ++# CONFIG_MTD_DOC2001PLUS is not set + + # + # NAND Flash Device Drivers +diff -puN arch/um/drivers/Makefile~uml-summa.diff arch/um/drivers/Makefile +--- limbo/arch/um/drivers/Makefile~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/drivers/Makefile Tue Oct 21 16:42:38 2003 +@@ -1,5 +1,5 @@ + # +-# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) ++# Copyright (C) 2000, 2002, 2003 Jeff Dike (jdike@karaya.com) + # Licensed under the GPL + # + +@@ -39,6 +39,8 @@ obj-$(CONFIG_PTY_CHAN) += pty.o + obj-$(CONFIG_TTY_CHAN) += tty.o + obj-$(CONFIG_XTERM_CHAN) += xterm.o xterm_kern.o + obj-$(CONFIG_UML_WATCHDOG) += harddog.o ++obj-$(CONFIG_BLK_DEV_COW) += cow_kern.o ++obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o + + obj-y += stdio_console.o $(CHAN_OBJS) + +@@ -46,7 +48,7 @@ USER_SINGLE_OBJS = $(foreach f,$(patsubs + + USER_OBJS := $(filter %_user.o,$(obj-y) $(obj-m) $(USER_SINGLE_OBJS)) fd.o \ + null.o pty.o tty.o xterm.o +-USER_OBJS := $(foreach file,$(USER_OBJS),arch/um/drivers/$(file)) ++USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) + + $(USER_OBJS) : %.o: %.c + $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< +diff -puN arch/um/drivers/chan_kern.c~uml-summa.diff arch/um/drivers/chan_kern.c +--- limbo/arch/um/drivers/chan_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/drivers/chan_kern.c Tue Oct 21 
16:42:38 2003 +@@ -8,6 +8,7 @@ + #include <linux/list.h> + #include <linux/slab.h> + #include <linux/tty.h> ++#include <linux/string.h> + #include <linux/tty_flip.h> + #include <asm/irq.h> + #include "chan_kern.h" +diff -puN arch/um/drivers/chan_user.c~uml-summa.diff arch/um/drivers/chan_user.c +--- limbo/arch/um/drivers/chan_user.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/drivers/chan_user.c Tue Oct 21 16:42:38 2003 +@@ -188,8 +188,8 @@ void register_winch(int fd, void *device + if(!isatty(fd)) return; + + pid = tcgetpgrp(fd); +- if(!CHOOSE_MODE(is_tracer_winch(pid, fd, device_data), 0) && +- (pid == -1)){ ++ if(!CHOOSE_MODE_PROC(is_tracer_winch, is_skas_winch, pid, fd, ++ device_data) && (pid == -1)){ + thread = winch_tramp(fd, device_data, &thread_fd); + if(fd != -1){ + register_winch_irq(thread_fd, fd, thread, device_data); +diff -puN arch/um/drivers/hostaudio_kern.c~uml-summa.diff arch/um/drivers/hostaudio_kern.c +--- limbo/arch/um/drivers/hostaudio_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/drivers/hostaudio_kern.c Tue Oct 21 16:42:38 2003 +@@ -11,6 +11,7 @@ + #include "linux/fs.h" + #include "linux/sound.h" + #include "linux/soundcard.h" ++#include "asm/uaccess.h" + #include "kern_util.h" + #include "init.h" + #include "hostaudio.h" +@@ -22,7 +23,7 @@ char *mixer = HOSTAUDIO_DEV_MIXER; + #ifndef MODULE + static int set_dsp(char *name, int *add) + { +- dsp = uml_strdup(name); ++ dsp = name; + return(0); + } + +@@ -34,7 +35,7 @@ __uml_setup("dsp=", set_dsp, + + static int set_mixer(char *name, int *add) + { +- mixer = uml_strdup(name); ++ mixer = name; + return(0); + } + +@@ -51,23 +52,55 @@ static ssize_t hostaudio_read(struct fil + loff_t *ppos) + { + struct hostaudio_state *state = file->private_data; ++ void *kbuf; ++ int err; + + #ifdef DEBUG + printk("hostaudio: read called, count = %d\n", count); + #endif + +- return(hostaudio_read_user(state, buffer, count, ppos)); ++ kbuf = kmalloc(count, GFP_KERNEL); ++ if(kbuf == NULL) ++ return(-ENOMEM); ++ ++ err = hostaudio_read_user(state, kbuf, count, ppos); ++ if(err < 0) ++ goto out; ++ ++ if(copy_to_user(buffer, kbuf, err)) ++ err = -EFAULT; ++ ++ out: ++ kfree(kbuf); ++ return(err); + } + + static ssize_t hostaudio_write(struct file *file, const char *buffer, + size_t count, loff_t *ppos) + { + struct hostaudio_state *state = file->private_data; ++ void *kbuf; ++ int err; + + #ifdef DEBUG + printk("hostaudio: write called, count = %d\n", count); + #endif +- return(hostaudio_write_user(state, buffer, count, ppos)); ++ ++ kbuf = kmalloc(count, GFP_KERNEL); ++ if(kbuf == NULL) ++ return(-ENOMEM); ++ ++ err = -EFAULT; ++ if(copy_from_user(kbuf, buffer, count)) ++ goto out; ++ ++ err = hostaudio_write_user(state, kbuf, count, ppos); ++ if(err < 0) ++ goto out; ++ ++ out: ++ kfree(kbuf); ++ return(err); + } + + static unsigned int hostaudio_poll(struct file *file, +@@ -86,12 +119,43 @@ static int hostaudio_ioctl(struct inode + unsigned int cmd, unsigned long arg) + { + struct hostaudio_state *state = file->private_data; ++ unsigned long data = 0; ++ int err; + + #ifdef DEBUG + printk("hostaudio: ioctl called, cmd = %u\n", cmd); + #endif ++ switch(cmd){ ++ case SNDCTL_DSP_SPEED: ++ case SNDCTL_DSP_STEREO: ++ case SNDCTL_DSP_GETBLKSIZE: ++ case SNDCTL_DSP_CHANNELS: ++ case SNDCTL_DSP_SUBDIVIDE: ++ case SNDCTL_DSP_SETFRAGMENT: ++ if(get_user(data, (int *) arg)) ++ return(-EFAULT); ++ break; ++ default: ++ break; ++ } ++ ++ err = hostaudio_ioctl_user(state, cmd, (unsigned long) &data); ++ ++ 
switch(cmd){ ++ case SNDCTL_DSP_SPEED: ++ case SNDCTL_DSP_STEREO: ++ case SNDCTL_DSP_GETBLKSIZE: ++ case SNDCTL_DSP_CHANNELS: ++ case SNDCTL_DSP_SUBDIVIDE: ++ case SNDCTL_DSP_SETFRAGMENT: ++ if(put_user(data, (int *) arg)) ++ return(-EFAULT); ++ break; ++ default: ++ break; ++ } + +- return(hostaudio_ioctl_user(state, cmd, arg)); ++ return(err); + } + + static int hostaudio_open(struct inode *inode, struct file *file) +@@ -225,7 +289,8 @@ MODULE_LICENSE("GPL"); + + static int __init hostaudio_init_module(void) + { +- printk(KERN_INFO "UML Audio Relay\n"); ++ printk(KERN_INFO "UML Audio Relay (host dsp = %s, host mixer = %s)\n", ++ dsp, mixer); + + module_data.dev_audio = register_sound_dsp(&hostaudio_fops, -1); + if(module_data.dev_audio < 0){ +diff -puN arch/um/drivers/line.c~uml-summa.diff arch/um/drivers/line.c +--- limbo/arch/um/drivers/line.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/drivers/line.c Tue Oct 21 16:42:38 2003 +@@ -6,8 +6,8 @@ + #include "linux/sched.h" + #include "linux/slab.h" + #include "linux/list.h" ++#include "linux/interrupt.h" + #include "linux/devfs_fs_kernel.h" +-#include "asm/irq.h" + #include "asm/uaccess.h" + #include "chan_kern.h" + #include "irq_user.h" +@@ -16,16 +16,18 @@ + #include "user_util.h" + #include "kern_util.h" + #include "os.h" ++#include "irq_kern.h" + + #define LINE_BUFSIZE 4096 + +-void line_interrupt(int irq, void *data, struct pt_regs *unused) ++irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused) + { + struct line *dev = data; + + if(dev->count > 0) + chan_interrupt(&dev->chan_list, &dev->task, dev->tty, irq, + dev); ++ return IRQ_HANDLED; + } + + void line_timer_cb(void *arg) +@@ -136,20 +138,22 @@ int line_write(struct line *lines, struc + return(len); + } + +-void line_write_interrupt(int irq, void *data, struct pt_regs *unused) ++irqreturn_t line_write_interrupt(int irq, void *data, struct pt_regs *unused) + { + struct line *dev = data; + struct tty_struct *tty = dev->tty; + int err; + + err = flush_buffer(dev); +- if(err == 0) return; ++ if(err == 0) ++ return(IRQ_NONE); + else if(err < 0){ + dev->head = dev->buffer; + dev->tail = dev->buffer; + } + +- if(tty == NULL) return; ++ if(tty == NULL) ++ return(IRQ_NONE); + + if(test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) && + (tty->ldisc.write_wakeup != NULL)) +@@ -161,9 +165,9 @@ void line_write_interrupt(int irq, void + * writes. 
+ */ + +- if (waitqueue_active(&tty->write_wait)) ++ if(waitqueue_active(&tty->write_wait)) + wake_up_interruptible(&tty->write_wait); +- ++ return(IRQ_HANDLED); + } + + int line_write_room(struct tty_struct *tty) +@@ -369,7 +373,7 @@ int line_get_config(char *name, struct l + + dev = simple_strtoul(name, &end, 0); + if((*end != '\0') || (end == name)){ +- *error_out = "line_setup failed to parse device number"; ++ *error_out = "line_get_config failed to parse device number"; + return(0); + } + +@@ -379,15 +383,15 @@ int line_get_config(char *name, struct l + } + + line = &lines[dev]; ++ + down(&line->sem); +- + if(!line->valid) + CONFIG_CHUNK(str, size, n, "none", 1); + else if(line->count == 0) + CONFIG_CHUNK(str, size, n, line->init_str, 1); + else n = chan_config_string(&line->chan_list, str, size, error_out); +- + up(&line->sem); ++ + return(n); + } + +@@ -412,7 +416,8 @@ struct tty_driver *line_register_devfs(s + return NULL; + + driver->driver_name = line_driver->name; +- driver->name = line_driver->devfs_name; ++ driver->name = line_driver->device_name; ++ driver->devfs_name = line_driver->devfs_name; + driver->major = line_driver->major; + driver->minor_start = line_driver->minor_start; + driver->type = line_driver->type; +@@ -432,7 +437,7 @@ struct tty_driver *line_register_devfs(s + + for(i = 0; i < nlines; i++){ + if(!lines[i].valid) +- tty_unregister_devfs(driver, i); ++ tty_unregister_device(driver, i); + } + + mconsole_register_dev(&line_driver->mc); +@@ -465,24 +470,25 @@ struct winch { + struct line *line; + }; + +-void winch_interrupt(int irq, void *data, struct pt_regs *unused) ++irqreturn_t winch_interrupt(int irq, void *data, struct pt_regs *unused) + { + struct winch *winch = data; + struct tty_struct *tty; + int err; + char c; + +- err = generic_read(winch->fd, &c, NULL); +- if(err < 0){ +- if(err != -EAGAIN){ +- printk("winch_interrupt : read failed, errno = %d\n", +- -err); +- printk("fd %d is losing SIGWINCH support\n", +- winch->tty_fd); +- free_irq(irq, data); +- return; ++ if(winch->fd != -1){ ++ err = generic_read(winch->fd, &c, NULL); ++ if(err < 0){ ++ if(err != -EAGAIN){ ++ printk("winch_interrupt : read failed, " ++ "errno = %d\n", -err); ++ printk("fd %d is losing SIGWINCH support\n", ++ winch->tty_fd); ++ return(IRQ_HANDLED); ++ } ++ goto out; + } +- goto out; + } + tty = winch->line->tty; + if(tty != NULL){ +@@ -492,7 +498,9 @@ void winch_interrupt(int irq, void *data + kill_pg(tty->pgrp, SIGWINCH, 1); + } + out: +- reactivate_fd(winch->fd, WINCH_IRQ); ++ if(winch->fd != -1) ++ reactivate_fd(winch->fd, WINCH_IRQ); ++ return(IRQ_HANDLED); + } + + DECLARE_MUTEX(winch_handler_sem); +@@ -529,7 +537,10 @@ static void winch_cleanup(void) + + list_for_each(ele, &winch_handlers){ + winch = list_entry(ele, struct winch, list); +- close(winch->fd); ++ if(winch->fd != -1){ ++ deactivate_fd(winch->fd, WINCH_IRQ); ++ close(winch->fd); ++ } + if(winch->pid != -1) + os_kill_process(winch->pid, 1); + } +diff -puN arch/um/drivers/mconsole_kern.c~uml-summa.diff arch/um/drivers/mconsole_kern.c +--- limbo/arch/um/drivers/mconsole_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/drivers/mconsole_kern.c Tue Oct 21 16:42:38 2003 +@@ -27,6 +27,7 @@ + #include "init.h" + #include "os.h" + #include "umid.h" ++#include "irq_kern.h" + + static int do_unlink_socket(struct notifier_block *notifier, + unsigned long what, void *data) +@@ -67,7 +68,7 @@ void mc_work_proc(void *unused) + + DECLARE_WORK(mconsole_work, mc_work_proc, NULL); + +-void 
mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++irqreturn_t mconsole_interrupt(int irq, void *dev_id, struct pt_regs *regs) + { + int fd; + struct mconsole_entry *new; +@@ -88,6 +89,7 @@ void mconsole_interrupt(int irq, void *d + } + if(!list_empty(&mc_requests)) schedule_work(&mconsole_work); + reactivate_fd(fd, MCONSOLE_IRQ); ++ return(IRQ_HANDLED); + } + + void mconsole_version(struct mc_request *req) +@@ -100,20 +102,34 @@ void mconsole_version(struct mc_request + mconsole_reply(req, version, 0, 0); + } + ++void mconsole_log(struct mc_request *req) ++{ ++ int len; ++ char *ptr = req->request.data; ++ ++ ptr += strlen("log"); ++ while(isspace(*ptr)) ptr++; ++ ++ len = ptr - req->request.data; ++ printk("%.*s", len, ptr); ++ mconsole_reply(req, "", 0, 0); ++} ++ + #define UML_MCONSOLE_HELPTEXT \ +-"Commands: +- version - Get kernel version +- help - Print this message +- halt - Halt UML +- reboot - Reboot UML +- config <dev>=<config> - Add a new device to UML; +- same syntax as command line +- config <dev> - Query the configuration of a device +- remove <dev> - Remove a device from UML +- sysrq <letter> - Performs the SysRq action controlled by the letter +- cad - invoke the Ctl-Alt-Del handler +- stop - pause the UML; it will do nothing until it receives a 'go' +- go - continue the UML after a 'stop' ++"Commands: \n\ ++ version - Get kernel version \n\ ++ help - Print this message \n\ ++ halt - Halt UML \n\ ++ reboot - Reboot UML \n\ ++ config <dev>=<config> - Add a new device to UML; \n\ ++ same syntax as command line \n\ ++ config <dev> - Query the configuration of a device \n\ ++ remove <dev> - Remove a device from UML \n\ ++ sysrq <letter> - Performs the SysRq action controlled by the letter \n\ ++ cad - invoke the Ctl-Alt-Del handler \n\ ++ stop - pause the UML; it will do nothing until it receives a 'go' \n\ ++ go - continue the UML after a 'stop' \n\ ++ log <string> - make UML enter <string> into the kernel log\n\ + " + + void mconsole_help(struct mc_request *req) +@@ -302,7 +318,7 @@ int mconsole_init(void) + if(umid_file_name("mconsole", file, sizeof(file))) return(-1); + snprintf(mconsole_socket_name, sizeof(file), "%s", file); + +- sock = create_unix_socket(file, sizeof(file)); ++ sock = create_unix_socket(file, sizeof(file), 1); + if (sock < 0){ + printk("Failed to initialize management console\n"); + return(1); +diff -puN arch/um/drivers/mconsole_user.c~uml-summa.diff arch/um/drivers/mconsole_user.c +--- limbo/arch/um/drivers/mconsole_user.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/drivers/mconsole_user.c Tue Oct 21 16:42:38 2003 +@@ -28,6 +28,7 @@ static struct mconsole_command commands[ + { "cad", mconsole_cad, 1 }, + { "stop", mconsole_stop, 0 }, + { "go", mconsole_go, 1 }, ++ { "log", mconsole_log, 1 }, + }; + + /* Initialized in mconsole_init, which is an initcall */ +@@ -139,6 +140,7 @@ int mconsole_reply(struct mc_request *re + memcpy(reply.data, str, len); + reply.data[len] = '\0'; + total -= len; ++ str += len; + reply.len = len + 1; + + len = sizeof(reply) + reply.len - sizeof(reply.data); +diff -puN arch/um/drivers/mmapper_kern.c~uml-summa.diff arch/um/drivers/mmapper_kern.c +--- limbo/arch/um/drivers/mmapper_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/drivers/mmapper_kern.c Tue Oct 21 16:42:38 2003 +@@ -120,7 +120,10 @@ static int __init mmapper_init(void) + printk(KERN_INFO "Mapper v0.1\n"); + + v_buf = (char *) find_iomem("mmapper", &mmapper_size); +- if(mmapper_size == 0) return(0); ++ 
if(mmapper_size == 0){ ++ printk(KERN_ERR "mmapper_init - find_iomem failed\n"); ++ return(0); ++ } + + p_buf = __pa(v_buf); + +diff -puN arch/um/drivers/net_kern.c~uml-summa.diff arch/um/drivers/net_kern.c +--- limbo/arch/um/drivers/net_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/drivers/net_kern.c Tue Oct 21 16:42:38 2003 +@@ -26,6 +26,7 @@ + #include "mconsole_kern.h" + #include "init.h" + #include "irq_user.h" ++#include "irq_kern.h" + + static spinlock_t opened_lock = SPIN_LOCK_UNLOCKED; + LIST_HEAD(opened); +@@ -61,14 +62,14 @@ static int uml_net_rx(struct net_device + return pkt_len; + } + +-void uml_net_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++irqreturn_t uml_net_interrupt(int irq, void *dev_id, struct pt_regs *regs) + { + struct net_device *dev = dev_id; + struct uml_net_private *lp = dev->priv; + int err; + + if(!netif_running(dev)) +- return; ++ return(IRQ_NONE); + + spin_lock(&lp->lock); + while((err = uml_net_rx(dev)) > 0) ; +@@ -83,6 +84,7 @@ void uml_net_interrupt(int irq, void *de + + out: + spin_unlock(&lp->lock); ++ return(IRQ_HANDLED); + } + + static int uml_net_open(struct net_device *dev) +@@ -252,37 +254,6 @@ void uml_net_user_timer_expire(unsigned + #endif + } + +-/* +- * default do nothing hard header packet routines for struct net_device init. +- * real ethernet transports will overwrite with real routines. +- */ +-static int uml_net_hard_header(struct sk_buff *skb, struct net_device *dev, +- unsigned short type, void *daddr, void *saddr, unsigned len) +-{ +- return(0); /* no change */ +-} +- +-static int uml_net_rebuild_header(struct sk_buff *skb) +-{ +- return(0); /* ignore */ +-} +- +-static int uml_net_header_cache(struct neighbour *neigh, struct hh_cache *hh) +-{ +- return(-1); /* fail */ +-} +- +-static void uml_net_header_cache_update(struct hh_cache *hh, +- struct net_device *dev, unsigned char * haddr) +-{ +- /* ignore */ +-} +- +-static int uml_net_header_parse(struct sk_buff *skb, unsigned char *haddr) +-{ +- return(0); /* nothing */ +-} +- + static spinlock_t devices_lock = SPIN_LOCK_UNLOCKED; + static struct list_head devices = LIST_HEAD_INIT(devices); + +@@ -292,7 +263,7 @@ static int eth_configure(int n, void *in + struct uml_net *device; + struct net_device *dev; + struct uml_net_private *lp; +- int err, size; ++ int save, err, size; + + size = transport->private_size + sizeof(struct uml_net_private) + + sizeof(((struct uml_net_private *) 0)->user); +@@ -334,12 +305,6 @@ static int eth_configure(int n, void *in + snprintf(dev->name, sizeof(dev->name), "eth%d", n); + device->dev = dev; + +- dev->hard_header = uml_net_hard_header; +- dev->rebuild_header = uml_net_rebuild_header; +- dev->hard_header_cache = uml_net_header_cache; +- dev->header_cache_update= uml_net_header_cache_update; +- dev->hard_header_parse = uml_net_header_parse; +- + (*transport->kern->init)(dev, init); + + dev->mtu = transport->user->max_packet; +@@ -362,21 +327,29 @@ static int eth_configure(int n, void *in + return 1; + lp = dev->priv; + +- INIT_LIST_HEAD(&lp->list); +- spin_lock_init(&lp->lock); +- lp->dev = dev; +- lp->fd = -1; +- lp->mac = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0 }; +- lp->have_mac = device->have_mac; +- lp->protocol = transport->kern->protocol; +- lp->open = transport->user->open; +- lp->close = transport->user->close; +- lp->remove = transport->user->remove; +- lp->read = transport->kern->read; +- lp->write = transport->kern->write; +- lp->add_address = transport->user->add_address; +- lp->delete_address = 
transport->user->delete_address; +- lp->set_mtu = transport->user->set_mtu; ++ /* lp.user is the first four bytes of the transport data, which ++ * has already been initialized. This structure assignment will ++ * overwrite that, so we make sure that .user gets overwritten with ++ * what it already has. ++ */ ++ save = lp->user[0]; ++ *lp = ((struct uml_net_private) ++ { .list = LIST_HEAD_INIT(lp->list), ++ .lock = SPIN_LOCK_UNLOCKED, ++ .dev = dev, ++ .fd = -1, ++ .mac = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0}, ++ .have_mac = device->have_mac, ++ .protocol = transport->kern->protocol, ++ .open = transport->user->open, ++ .close = transport->user->close, ++ .remove = transport->user->remove, ++ .read = transport->kern->read, ++ .write = transport->kern->write, ++ .add_address = transport->user->add_address, ++ .delete_address = transport->user->delete_address, ++ .set_mtu = transport->user->set_mtu, ++ .user = { save } }); + + init_timer(&lp->tl); + lp->tl.function = uml_net_user_timer_expire; +@@ -609,7 +582,8 @@ static int net_remove(char *str) + unregister_netdev(dev); + + list_del(&device->list); +- free_netdev(device); ++ kfree(device); ++ free_netdev(dev); + return(0); + } + +diff -puN arch/um/drivers/port_kern.c~uml-summa.diff arch/um/drivers/port_kern.c +--- limbo/arch/um/drivers/port_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/drivers/port_kern.c Tue Oct 21 16:42:38 2003 +@@ -6,6 +6,7 @@ + #include "linux/list.h" + #include "linux/sched.h" + #include "linux/slab.h" ++#include "linux/interrupt.h" + #include "linux/irq.h" + #include "linux/spinlock.h" + #include "linux/errno.h" +@@ -14,6 +15,7 @@ + #include "kern_util.h" + #include "kern.h" + #include "irq_user.h" ++#include "irq_kern.h" + #include "port.h" + #include "init.h" + #include "os.h" +@@ -44,7 +46,7 @@ struct connection { + struct port_list *port; + }; + +-static void pipe_interrupt(int irq, void *data, struct pt_regs *regs) ++static irqreturn_t pipe_interrupt(int irq, void *data, struct pt_regs *regs) + { + struct connection *conn = data; + int fd; +@@ -52,7 +54,7 @@ static void pipe_interrupt(int irq, void + fd = os_rcv_fd(conn->socket[0], &conn->helper_pid); + if(fd < 0){ + if(fd == -EAGAIN) +- return; ++ return(IRQ_NONE); + + printk(KERN_ERR "pipe_interrupt : os_rcv_fd returned %d\n", + -fd); +@@ -65,6 +67,7 @@ static void pipe_interrupt(int irq, void + list_add(&conn->list, &conn->port->connections); + + up(&conn->port->sem); ++ return(IRQ_HANDLED); + } + + static int port_accept(struct port_list *port) +@@ -138,12 +141,13 @@ void port_work_proc(void *unused) + + DECLARE_WORK(port_work, port_work_proc, NULL); + +-static void port_interrupt(int irq, void *data, struct pt_regs *regs) ++static irqreturn_t port_interrupt(int irq, void *data, struct pt_regs *regs) + { + struct port_list *port = data; + + port->has_connection = 1; + schedule_work(&port_work); ++ return(IRQ_HANDLED); + } + + void *port_data(int port_num) +diff -puN arch/um/drivers/ssl.c~uml-summa.diff arch/um/drivers/ssl.c +--- limbo/arch/um/drivers/ssl.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/drivers/ssl.c Tue Oct 21 16:42:38 2003 +@@ -53,8 +53,9 @@ static int ssl_remove(char *str); + + static struct line_driver driver = { + .name = "UML serial line", +- .devfs_name = "tts/%d", +- .major = TTYAUX_MAJOR, ++ .device_name = "ttS", ++ .devfs_name = "tts/", ++ .major = TTY_MAJOR, + .minor_start = 64, + .type = TTY_DRIVER_TYPE_SERIAL, + .subtype = 0, +diff -puN arch/um/drivers/stdio_console.c~uml-summa.diff 
arch/um/drivers/stdio_console.c
+--- limbo/arch/um/drivers/stdio_console.c~uml-summa.diff Tue Oct 21 16:42:37 2003
++++ limbo-god/arch/um/drivers/stdio_console.c Tue Oct 21 16:42:38 2003
+@@ -83,7 +83,8 @@ static int con_remove(char *str);
+
+ static struct line_driver driver = {
+ .name = "UML console",
+- .devfs_name = "vc/%d",
++ .device_name = "tty",
++ .devfs_name = "vc/",
+ .major = TTY_MAJOR,
+ .minor_start = 0,
+ .type = TTY_DRIVER_TYPE_CONSOLE,
+@@ -159,6 +160,15 @@ static int chars_in_buffer(struct tty_st
+
+ static int con_init_done = 0;
+
++static struct tty_operations console_ops = {
++ .open = con_open,
++ .close = con_close,
++ .write = con_write,
++ .chars_in_buffer = chars_in_buffer,
++ .set_termios = set_termios,
++ .write_room = line_write_room,
++};
++
+ int stdio_init(void)
+ {
+ char *new_title;
+@@ -166,7 +176,8 @@ int stdio_init(void)
+ printk(KERN_INFO "Initializing stdio console driver\n");
+
+ console_driver = line_register_devfs(&console_lines, &driver,
+- &console_ops, vts, sizeof(vts)/sizeof(vts[0]));
++ &console_ops, vts,
++ sizeof(vts)/sizeof(vts[0]));
+
+ lines_init(vts, sizeof(vts)/sizeof(vts[0]));
+
+@@ -188,15 +199,6 @@ static void console_write(struct console
+ if(con_init_done) up(&vts[console->index].sem);
+ }
+
+-static struct tty_operations console_ops = {
+- .open = con_open,
+- .close = con_close,
+- .write = con_write,
+- .chars_in_buffer = chars_in_buffer,
+- .set_termios = set_termios,
+- .write_room = line_write_room,
+-};
+-
+ static struct tty_driver *console_device(struct console *c, int *index)
+ {
+ *index = c->index;
+@@ -212,12 +214,14 @@ static struct console stdiocons = INIT_C
+ console_device, console_setup,
+ CON_PRINTBUFFER);
+
+-static void __init stdio_console_init(void)
++static int __init stdio_console_init(void)
+ {
+ INIT_LIST_HEAD(&vts[0].chan_list);
+ list_add(&init_console_chan.list, &vts[0].chan_list);
+ register_console(&stdiocons);
++ return(0);
+ }
++
+ console_initcall(stdio_console_init);
+
+ static int console_chan_setup(char *str)
+diff -puN arch/um/drivers/ubd_kern.c~uml-summa.diff arch/um/drivers/ubd_kern.c
+--- limbo/arch/um/drivers/ubd_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003
++++ limbo-god/arch/um/drivers/ubd_kern.c Tue Oct 21 16:42:38 2003
+@@ -8,15 +8,23 @@
+ * old style ubd by setting UBD_SHIFT to 0
+ * 2002-09-27...2002-10-18 massive tinkering for 2.5
+ * partitions have changed in 2.5
++ * 2003-01-29 more tinkering for 2.5.59-1
++ * This should now address the sysfs problems and has
++ * the symlink for devfs to allow for booting with
++ * the common /dev/ubd/discX/... names rather than
++ * only /dev/ubdN/discN. This version also has lots of
++ * clean ups preparing for ubd-many.
++ James McMechan
+ */
+
+ #define MAJOR_NR UBD_MAJOR
+-#define UBD_SHIFT 4
++#define UBD_SHIFT 0
+
+ #include "linux/config.h"
+ #include "linux/module.h"
+ #include "linux/blkdev.h"
+ #include "linux/hdreg.h"
++#include "linux/interrupt.h"
+ #include "linux/init.h"
+ #include "linux/devfs_fs_kernel.h"
+ #include "linux/cdrom.h"
+@@ -28,6 +36,7 @@
+ #include "linux/blkpg.h"
+ #include "linux/genhd.h"
+ #include "linux/spinlock.h"
++#include "linux/bitops.h"
+ #include "asm/segment.h"
+ #include "asm/uaccess.h"
+ #include "asm/irq.h"
+@@ -40,6 +49,7 @@
+ #include "mconsole_kern.h"
+ #include "init.h"
+ #include "irq_user.h"
++#include "irq_kern.h"
+ #include "ubd_user.h"
+ #include "2_5compat.h"
+ #include "os.h"
+@@ -47,7 +57,10 @@
+ static spinlock_t ubd_io_lock = SPIN_LOCK_UNLOCKED;
+ static spinlock_t ubd_lock = SPIN_LOCK_UNLOCKED;
+
+-static void (*do_ubd)(void);
++/* We set this when we ask the io thread to do some work;
++ using this flag we avoid having do_ubd_request schedule
++ io more than once for any given request. (race seen on SMP) */
++static long ubd_servicing;
+
+ static int ubd_open(struct inode * inode, struct file * filp);
+ static int ubd_release(struct inode * inode, struct file * file);
+@@ -67,7 +80,7 @@ static struct block_device_operations ub
+ static request_queue_t *ubd_queue;
+
+ /* Protected by ubd_lock */
+-static int fake_major = 0;
++static int fake_major = MAJOR_NR;
+
+ static struct gendisk *ubd_gendisk[MAX_DEV];
+ static struct gendisk *fake_gendisk[MAX_DEV];
+@@ -96,12 +109,12 @@ struct cow {
+
+ struct ubd {
+ char *file;
+- int is_dir;
+ int count;
+ int fd;
+ __u64 size;
+ struct openflags boot_openflags;
+ struct openflags openflags;
++ int no_cow;
+ struct cow cow;
+ };
+
+@@ -115,12 +128,12 @@ struct ubd {
+
+ #define DEFAULT_UBD { \
+ .file = NULL, \
+- .is_dir = 0, \
+ .count = 0, \
+ .fd = -1, \
+ .size = -1, \
+ .boot_openflags = OPEN_FLAGS, \
+ .openflags = OPEN_FLAGS, \
++ .no_cow = 0, \
+ .cow = DEFAULT_COW, \
+ }
+
+@@ -128,8 +141,10 @@ struct ubd ubd_dev[MAX_DEV] = { [ 0 ...
+ + static int ubd0_init(void) + { +- if(ubd_dev[0].file == NULL) +- ubd_dev[0].file = "root_fs"; ++ struct ubd *dev = &ubd_dev[0]; ++ ++ if(dev->file == NULL) ++ dev->file = "root_fs"; + return(0); + } + +@@ -196,19 +211,39 @@ __uml_help(fake_ide_setup, + " Create ide0 entries that map onto ubd devices.\n\n" + ); + ++static int parse_unit(char **ptr) ++{ ++ char *str = *ptr, *end; ++ int n = -1; ++ ++ if(isdigit(*str)) { ++ n = simple_strtoul(str, &end, 0); ++ if(end == str) ++ return(-1); ++ *ptr = end; ++ } ++ else if (('a' <= *str) && (*str <= 'h')) { ++ n = *str - 'a'; ++ str++; ++ *ptr = str; ++ } ++ return(n); ++} ++ + static int ubd_setup_common(char *str, int *index_out) + { ++ struct ubd *dev; + struct openflags flags = global_openflags; + char *backing_file; + int n, err; + + if(index_out) *index_out = -1; +- n = *str++; ++ n = *str; + if(n == '='){ +- static int fake_major_allowed = 1; + char *end; + int major; + ++ str++; + if(!strcmp(str, "sync")){ + global_openflags.s = 1; + return(0); +@@ -220,20 +255,14 @@ static int ubd_setup_common(char *str, i + return(1); + } + +- if(!fake_major_allowed){ +- printk(KERN_ERR "Can't assign a fake major twice\n"); +- return(1); +- } +- + err = 1; + spin_lock(&ubd_lock); +- if(!fake_major_allowed){ ++ if(fake_major != MAJOR_NR){ + printk(KERN_ERR "Can't assign a fake major twice\n"); + goto out1; + } + + fake_major = major; +- fake_major_allowed = 0; + + printk(KERN_INFO "Setting extra ubd major number to %d\n", + major); +@@ -243,25 +272,23 @@ static int ubd_setup_common(char *str, i + return(err); + } + +- if(n < '0'){ +- printk(KERN_ERR "ubd_setup : index out of range\n"); } +- +- if((n >= '0') && (n <= '9')) n -= '0'; +- else if((n >= 'a') && (n <= 'z')) n -= 'a'; +- else { +- printk(KERN_ERR "ubd_setup : device syntax invalid\n"); ++ n = parse_unit(&str); ++ if(n < 0){ ++ printk(KERN_ERR "ubd_setup : couldn't parse unit number " ++ "'%s'\n", str); + return(1); + } + if(n >= MAX_DEV){ +- printk(KERN_ERR "ubd_setup : index out of range " +- "(%d devices)\n", MAX_DEV); ++ printk(KERN_ERR "ubd_setup : index %d out of range " ++ "(%d devices)\n", n, MAX_DEV); + return(1); + } + + err = 1; + spin_lock(&ubd_lock); + +- if(ubd_dev[n].file != NULL){ ++ dev = &ubd_dev[n]; ++ if(dev->file != NULL){ + printk(KERN_ERR "ubd_setup : device already configured\n"); + goto out2; + } +@@ -276,6 +303,11 @@ static int ubd_setup_common(char *str, i + flags.s = 1; + str++; + } ++ if (*str == 'd'){ ++ dev->no_cow = 1; ++ str++; ++ } ++ + if(*str++ != '='){ + printk(KERN_ERR "ubd_setup : Expected '='\n"); + goto out2; +@@ -284,14 +316,17 @@ static int ubd_setup_common(char *str, i + err = 0; + backing_file = strchr(str, ','); + if(backing_file){ +- *backing_file = '\0'; +- backing_file++; ++ if(dev->no_cow) ++ printk(KERN_ERR "Can't specify both 'd' and a " ++ "cow file\n"); ++ else { ++ *backing_file = '\0'; ++ backing_file++; ++ } + } +- ubd_dev[n].file = str; +- if(ubd_is_dir(ubd_dev[n].file)) +- ubd_dev[n].is_dir = 1; +- ubd_dev[n].cow.file = backing_file; +- ubd_dev[n].boot_openflags = flags; ++ dev->file = str; ++ dev->cow.file = backing_file; ++ dev->boot_openflags = flags; + out2: + spin_unlock(&ubd_lock); + return(err); +@@ -321,8 +356,7 @@ __uml_help(ubd_setup, + static int fakehd_set = 0; + static int fakehd(char *str) + { +- printk(KERN_INFO +- "fakehd : Changing ubd name to \"hd\".\n"); ++ printk(KERN_INFO "fakehd : Changing ubd name to \"hd\".\n"); + fakehd_set = 1; + return 1; + } +@@ -370,7 +404,6 @@ static void ubd_handler(void) + struct 
request *rq = elv_next_request(ubd_queue); + int n; + +- do_ubd = NULL; + intr_count++; + n = read_ubd_fs(thread_fd, &req, sizeof(req)); + if(n != sizeof(req)){ +@@ -379,6 +412,7 @@ static void ubd_handler(void) + spin_lock(&ubd_io_lock); + end_request(rq, 0); + spin_unlock(&ubd_io_lock); ++ clear_bit(1, &ubd_servicing); + return; + } + +@@ -387,13 +421,15 @@ static void ubd_handler(void) + panic("I/O op mismatch"); + + ubd_finish(rq, req.error); ++ clear_bit(1, &ubd_servicing); + reactivate_fd(thread_fd, UBD_IRQ); + do_ubd_request(ubd_queue); + } + +-static void ubd_intr(int irq, void *dev, struct pt_regs *unused) ++static irqreturn_t ubd_intr(int irq, void *dev, struct pt_regs *unused) + { + ubd_handler(); ++ return(IRQ_HANDLED); + } + + /* Only changed by ubd_init, which is an initcall. */ +@@ -429,16 +465,18 @@ static void ubd_close(struct ubd *dev) + static int ubd_open_dev(struct ubd *dev) + { + struct openflags flags; +- int err, n, create_cow, *create_ptr; ++ char **back_ptr; ++ int err, create_cow, *create_ptr; + ++ dev->openflags = dev->boot_openflags; + create_cow = 0; + create_ptr = (dev->cow.file != NULL) ? &create_cow : NULL; +- dev->fd = open_ubd_file(dev->file, &dev->openflags, &dev->cow.file, ++ back_ptr = dev->no_cow ? NULL : &dev->cow.file; ++ dev->fd = open_ubd_file(dev->file, &dev->openflags, back_ptr, + &dev->cow.bitmap_offset, &dev->cow.bitmap_len, + &dev->cow.data_offset, create_ptr); + + if((dev->fd == -ENOENT) && create_cow){ +- n = dev - ubd_dev; + dev->fd = create_cow_file(dev->file, dev->cow.file, + dev->openflags, 1 << 9, + &dev->cow.bitmap_offset, +@@ -455,7 +493,10 @@ static int ubd_open_dev(struct ubd *dev) + if(dev->cow.file != NULL){ + err = -ENOMEM; + dev->cow.bitmap = (void *) vmalloc(dev->cow.bitmap_len); +- if(dev->cow.bitmap == NULL) goto error; ++ if(dev->cow.bitmap == NULL){ ++ printk(KERN_ERR "Failed to vmalloc COW bitmap\n"); ++ goto error; ++ } + flush_tlb_kernel_vm(); + + err = read_cow_bitmap(dev->fd, dev->cow.bitmap, +@@ -481,17 +522,31 @@ static int ubd_new_disk(int major, u64 s + + { + struct gendisk *disk; ++ char from[sizeof("ubd/nnnnn\0")], to[sizeof("discnnnnn/disc\0")]; ++ int err; + + disk = alloc_disk(1 << UBD_SHIFT); +- if (!disk) +- return -ENOMEM; ++ if(disk == NULL) ++ return(-ENOMEM); + + disk->major = major; + disk->first_minor = unit << UBD_SHIFT; + disk->fops = &ubd_blops; + set_capacity(disk, size / 512); +- sprintf(disk->disk_name, "ubd"); +- sprintf(disk->devfs_name, "ubd/disc%d", unit); ++ if(major == MAJOR_NR){ ++ sprintf(disk->disk_name, "ubd%d", unit); ++ sprintf(disk->devfs_name, "ubd/disc%d", unit); ++ sprintf(from, "ubd/%d", unit); ++ sprintf(to, "disc%d/disc", unit); ++ err = devfs_mk_symlink(from, to); ++ if(err) ++ printk("ubd_new_disk failed to make link from %s to " ++ "%s, error = %d\n", from, to, err); ++ } ++ else { ++ sprintf(disk->disk_name, "ubd_fake%d", unit); ++ sprintf(disk->devfs_name, "ubd_fake/disc%d", unit); ++ } + + disk->private_data = &ubd_dev[unit]; + disk->queue = ubd_queue; +@@ -506,10 +561,7 @@ static int ubd_add(int n) + struct ubd *dev = &ubd_dev[n]; + int err; + +- if(dev->is_dir) +- return(-EISDIR); +- +- if (!dev->file) ++ if(dev->file == NULL) + return(-ENODEV); + + if (ubd_open_dev(dev)) +@@ -523,7 +575,7 @@ static int ubd_add(int n) + if(err) + return(err); + +- if(fake_major) ++ if(fake_major != MAJOR_NR) + ubd_new_disk(fake_major, dev->size, n, + &fake_gendisk[n]); + +@@ -561,42 +613,42 @@ static int ubd_config(char *str) + return(err); + } + +-static int ubd_get_config(char *dev, 
char *str, int size, char **error_out) ++static int ubd_get_config(char *name, char *str, int size, char **error_out) + { +- struct ubd *ubd; ++ struct ubd *dev; + char *end; +- int major, n = 0; ++ int n, len = 0; + +- major = simple_strtoul(dev, &end, 0); +- if((*end != '\0') || (end == dev)){ +- *error_out = "ubd_get_config : didn't parse major number"; ++ n = simple_strtoul(name, &end, 0); ++ if((*end != '\0') || (end == name)){ ++ *error_out = "ubd_get_config : didn't parse device number"; + return(-1); + } + +- if((major >= MAX_DEV) || (major < 0)){ +- *error_out = "ubd_get_config : major number out of range"; ++ if((n >= MAX_DEV) || (n < 0)){ ++ *error_out = "ubd_get_config : device number out of range"; + return(-1); + } + +- ubd = &ubd_dev[major]; ++ dev = &ubd_dev[n]; + spin_lock(&ubd_lock); + +- if(ubd->file == NULL){ +- CONFIG_CHUNK(str, size, n, "", 1); ++ if(dev->file == NULL){ ++ CONFIG_CHUNK(str, size, len, "", 1); + goto out; + } + +- CONFIG_CHUNK(str, size, n, ubd->file, 0); ++ CONFIG_CHUNK(str, size, len, dev->file, 0); + +- if(ubd->cow.file != NULL){ +- CONFIG_CHUNK(str, size, n, ",", 0); +- CONFIG_CHUNK(str, size, n, ubd->cow.file, 1); ++ if(dev->cow.file != NULL){ ++ CONFIG_CHUNK(str, size, len, ",", 0); ++ CONFIG_CHUNK(str, size, len, dev->cow.file, 1); + } +- else CONFIG_CHUNK(str, size, n, "", 1); ++ else CONFIG_CHUNK(str, size, len, "", 1); + + out: + spin_unlock(&ubd_lock); +- return(n); ++ return(len); + } + + static int ubd_remove(char *str) +@@ -604,11 +656,9 @@ static int ubd_remove(char *str) + struct ubd *dev; + int n, err = -ENODEV; + +- if(!isdigit(*str)) +- return(err); /* it should be a number 0-7/a-h */ ++ n = parse_unit(&str); + +- n = *str - '0'; +- if(n >= MAX_DEV) ++ if((n < 0) || (n >= MAX_DEV)) + return(err); + + dev = &ubd_dev[n]; +@@ -669,7 +719,7 @@ int ubd_init(void) + + elevator_init(ubd_queue, &elevator_noop); + +- if (fake_major != 0) { ++ if (fake_major != MAJOR_NR) { + char name[sizeof("ubd_nnn\0")]; + + snprintf(name, sizeof(name), "ubd_%d", fake_major); +@@ -714,15 +764,9 @@ static int ubd_open(struct inode *inode, + { + struct gendisk *disk = inode->i_bdev->bd_disk; + struct ubd *dev = disk->private_data; +- int err = -EISDIR; +- +- if(dev->is_dir == 1) +- goto out; ++ int err = 0; + +- err = 0; + if(dev->count == 0){ +- dev->openflags = dev->boot_openflags; +- + err = ubd_open_dev(dev); + if(err){ + printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n", +@@ -796,15 +840,6 @@ static int prepare_request(struct reques + + if(req->rq_status == RQ_INACTIVE) return(1); + +- if(dev->is_dir){ +- strcpy(req->buffer, "HOSTFS:"); +- strcat(req->buffer, dev->file); +- spin_lock(&ubd_io_lock); +- end_request(req, 1); +- spin_unlock(&ubd_io_lock); +- return(1); +- } +- + if((rq_data_dir(req) == WRITE) && !dev->openflags.w){ + printk("Write attempted on readonly ubd device %s\n", + disk->disk_name); +@@ -830,6 +865,27 @@ static int prepare_request(struct reques + io_req->cow_offset = -1; + io_req->error = 0; + ++//#define TRACE1 1 ++#ifdef TRACE1 ++ if (disk->first_minor >> disk->minor_shift == TRACE1) { ++ static unsigned long lastaccessed=-2; ++ static unsigned long written=0; ++ char *oper; ++ ++ switch ( io_req->op ) { ++ case UBD_READ: oper="READ"; break; ++ case UBD_WRITE: oper="WRITE"; break; ++ default: oper="UNKNOWN"; break; ++ } ++ if ( lastaccessed + 1 != req->sector) { ++ printk(KERN_DEBUG "Nonsequential disk %s for sector %ld, len %d, last accessed %ld contig %ld\n", oper, req->sector, nsect, lastaccessed, written); ++ written=0; ++ } 
++ lastaccessed=req->sector+nsect-1;
++ written+=nsect;
++ }
++#endif
++
+ if(dev->cow.file != NULL) cowify_req(io_req, dev);
+ return(0);
+ }
+@@ -851,16 +907,21 @@ static void do_ubd_request(request_queue
+ }
+ }
+ else {
+- if(do_ubd || list_empty(&q->queue_head)) return;
++ /* if there are no requests, or if another thread has
++ already started async io - return */
++ if(list_empty(&q->queue_head) ||
++ test_and_set_bit(1, &ubd_servicing)) return;
++
+ req = elv_next_request(q);
+ err = prepare_request(req, &io_req);
+ if(!err){
+- do_ubd = ubd_handler;
+ n = write_ubd_fs(thread_fd, (char *) &io_req,
+ sizeof(io_req));
+ if(n != sizeof(io_req))
+ printk("write to io thread failed, "
+ "errno = %d\n", -n);
++ } else {
++ clear_bit(1, &ubd_servicing);
+ }
+ }
+ }
+diff -puN arch/um/drivers/ubd_user.c~uml-summa.diff arch/um/drivers/ubd_user.c
+--- limbo/arch/um/drivers/ubd_user.c~uml-summa.diff Tue Oct 21 16:42:37 2003
++++ limbo-god/arch/um/drivers/ubd_user.c Tue Oct 21 16:42:38 2003
+@@ -24,142 +24,24 @@
+ #include "user.h"
+ #include "ubd_user.h"
+ #include "os.h"
++#include "cow.h"
+
+ #include <endian.h>
+ #include <byteswap.h>
+-#if __BYTE_ORDER == __BIG_ENDIAN
+-# define ntohll(x) (x)
+-# define htonll(x) (x)
+-#elif __BYTE_ORDER == __LITTLE_ENDIAN
+-# define ntohll(x) bswap_64(x)
+-# define htonll(x) bswap_64(x)
+-#else
+-#error "__BYTE_ORDER not defined"
+-#endif
+-
+-#define PATH_LEN_V1 256
+-
+-struct cow_header_v1 {
+- int magic;
+- int version;
+- char backing_file[PATH_LEN_V1];
+- time_t mtime;
+- __u64 size;
+- int sectorsize;
+-};
+-
+-#define PATH_LEN_V2 MAXPATHLEN
+-
+-struct cow_header_v2 {
+- unsigned long magic;
+- unsigned long version;
+- char backing_file[PATH_LEN_V2];
+- time_t mtime;
+- __u64 size;
+- int sectorsize;
+-};
+-
+-union cow_header {
+- struct cow_header_v1 v1;
+- struct cow_header_v2 v2;
+-};
+-
+-#define COW_MAGIC 0x4f4f4f4d /* MOOO */
+-#define COW_VERSION 2
+-
+-static void sizes(__u64 size, int sectorsize, int bitmap_offset,
+- unsigned long *bitmap_len_out, int *data_offset_out)
+-{
+- *bitmap_len_out = (size + sectorsize - 1) / (8 * sectorsize);
+-
+- *data_offset_out = bitmap_offset + *bitmap_len_out;
+- *data_offset_out = (*data_offset_out + sectorsize - 1) / sectorsize;
+- *data_offset_out *= sectorsize;
+-}
+-
+-static int read_cow_header(int fd, int *magic_out, char **backing_file_out,
+- time_t *mtime_out, __u64 *size_out,
+- int *sectorsize_out, int *bitmap_offset_out)
+-{
+- union cow_header *header;
+- char *file;
+- int err, n;
+- unsigned long version, magic;
+-
+- header = um_kmalloc(sizeof(*header));
+- if(header == NULL){
+- printk("read_cow_header - Failed to allocate header\n");
+- return(-ENOMEM);
+- }
+- err = -EINVAL;
+- n = read(fd, header, sizeof(*header));
+- if(n < offsetof(typeof(header->v1), backing_file)){
+- printk("read_cow_header - short header\n");
+- goto out;
+- }
+-
+- magic = header->v1.magic;
+- if(magic == COW_MAGIC) {
+- version = header->v1.version;
+- }
+- else if(magic == ntohl(COW_MAGIC)){
+- version = ntohl(header->v1.version);
+- }
+- else goto out;
+-
+- *magic_out = COW_MAGIC;
+-
+- if(version == 1){
+- if(n < sizeof(header->v1)){
+- printk("read_cow_header - failed to read V1 header\n");
+- goto out;
+- }
+- *mtime_out = header->v1.mtime;
+- *size_out = header->v1.size;
+- *sectorsize_out = header->v1.sectorsize;
+- *bitmap_offset_out = sizeof(header->v1);
+- file = header->v1.backing_file;
+- }
+- else if(version == 2){
+- if(n < sizeof(header->v2)){
+- printk("read_cow_header - failed to read 
V2 header\n"); +- goto out; +- } +- *mtime_out = ntohl(header->v2.mtime); +- *size_out = ntohll(header->v2.size); +- *sectorsize_out = ntohl(header->v2.sectorsize); +- *bitmap_offset_out = sizeof(header->v2); +- file = header->v2.backing_file; +- } +- else { +- printk("read_cow_header - invalid COW version\n"); +- goto out; +- } +- err = -ENOMEM; +- *backing_file_out = uml_strdup(file); +- if(*backing_file_out == NULL){ +- printk("read_cow_header - failed to allocate backing file\n"); +- goto out; +- } +- err = 0; +- out: +- kfree(header); +- return(err); +-} + + static int same_backing_files(char *from_cmdline, char *from_cow, char *cow) + { +- struct stat buf1, buf2; ++ struct stat64 buf1, buf2; + + if(from_cmdline == NULL) return(1); + if(!strcmp(from_cmdline, from_cow)) return(1); + +- if(stat(from_cmdline, &buf1) < 0){ ++ if(stat64(from_cmdline, &buf1) < 0){ + printk("Couldn't stat '%s', errno = %d\n", from_cmdline, + errno); + return(1); + } +- if(stat(from_cow, &buf2) < 0){ ++ if(stat64(from_cow, &buf2) < 0){ + printk("Couldn't stat '%s', errno = %d\n", from_cow, errno); + return(1); + } +@@ -215,118 +97,6 @@ int read_cow_bitmap(int fd, void *buf, i + return(0); + } + +-static int absolutize(char *to, int size, char *from) +-{ +- char save_cwd[256], *slash; +- int remaining; +- +- if(getcwd(save_cwd, sizeof(save_cwd)) == NULL) { +- printk("absolutize : unable to get cwd - errno = %d\n", errno); +- return(-1); +- } +- slash = strrchr(from, '/'); +- if(slash != NULL){ +- *slash = '\0'; +- if(chdir(from)){ +- *slash = '/'; +- printk("absolutize : Can't cd to '%s' - errno = %d\n", +- from, errno); +- return(-1); +- } +- *slash = '/'; +- if(getcwd(to, size) == NULL){ +- printk("absolutize : unable to get cwd of '%s' - " +- "errno = %d\n", from, errno); +- return(-1); +- } +- remaining = size - strlen(to); +- if(strlen(slash) + 1 > remaining){ +- printk("absolutize : unable to fit '%s' into %d " +- "chars\n", from, size); +- return(-1); +- } +- strcat(to, slash); +- } +- else { +- if(strlen(save_cwd) + 1 + strlen(from) + 1 > size){ +- printk("absolutize : unable to fit '%s' into %d " +- "chars\n", from, size); +- return(-1); +- } +- strcpy(to, save_cwd); +- strcat(to, "/"); +- strcat(to, from); +- } +- chdir(save_cwd); +- return(0); +-} +- +-static int write_cow_header(char *cow_file, int fd, char *backing_file, +- int sectorsize, long long *size) +-{ +- struct cow_header_v2 *header; +- struct stat64 buf; +- int err; +- +- err = os_seek_file(fd, 0); +- if(err != 0){ +- printk("write_cow_header - lseek failed, errno = %d\n", errno); +- return(-errno); +- } +- +- err = -ENOMEM; +- header = um_kmalloc(sizeof(*header)); +- if(header == NULL){ +- printk("Failed to allocate COW V2 header\n"); +- goto out; +- } +- header->magic = htonl(COW_MAGIC); +- header->version = htonl(COW_VERSION); +- +- err = -EINVAL; +- if(strlen(backing_file) > sizeof(header->backing_file) - 1){ +- printk("Backing file name \"%s\" is too long - names are " +- "limited to %d characters\n", backing_file, +- sizeof(header->backing_file) - 1); +- goto out_free; +- } +- +- if(absolutize(header->backing_file, sizeof(header->backing_file), +- backing_file)) +- goto out_free; +- +- err = stat64(header->backing_file, &buf); +- if(err < 0){ +- printk("Stat of backing file '%s' failed, errno = %d\n", +- header->backing_file, errno); +- err = -errno; +- goto out_free; +- } +- +- err = os_file_size(header->backing_file, size); +- if(err){ +- printk("Couldn't get size of backing file '%s', errno = %d\n", +- header->backing_file, 
-*size);
+- goto out_free;
+- }
+-
+- header->mtime = htonl(buf.st_mtime);
+- header->size = htonll(*size);
+- header->sectorsize = htonl(sectorsize);
+-
+- err = write(fd, header, sizeof(*header));
+- if(err != sizeof(*header)){
+- printk("Write of header to new COW file '%s' failed, "
+- "errno = %d\n", cow_file, errno);
+- goto out_free;
+- }
+- err = 0;
+- out_free:
+- kfree(header);
+- out:
+- return(err);
+-}
+-
+ int open_ubd_file(char *file, struct openflags *openflags,
+ char **backing_file_out, int *bitmap_offset_out,
+ unsigned long *bitmap_len_out, int *data_offset_out,
+@@ -346,10 +116,17 @@ int open_ubd_file(char *file, struct ope
+ if((fd = os_open_file(file, *openflags, mode)) < 0)
+ return(fd);
+ }
++
++ err = os_lock_file(fd, openflags->w);
++ if(err){
++ printk("Failed to lock '%s', errno = %d\n", file, -err);
++ goto error;
++ }
++
+ if(backing_file_out == NULL) return(fd);
+
+- err = read_cow_header(fd, &magic, &backing_file, &mtime, &size,
+- &sectorsize, bitmap_offset_out);
++ err = read_cow_header(file_reader, &fd, &magic, &backing_file, &mtime,
++ &size, &sectorsize, bitmap_offset_out);
+ if(err && (*backing_file_out != NULL)){
+ printk("Failed to read COW header from COW file \"%s\", "
+ "errno = %d\n", file, err);
+@@ -376,12 +153,12 @@ int open_ubd_file(char *file, struct ope
+ if(err) goto error;
+ }
+
+- sizes(size, sectorsize, *bitmap_offset_out, bitmap_len_out,
+- data_offset_out);
++ cow_sizes(size, sectorsize, *bitmap_offset_out, bitmap_len_out,
++ data_offset_out);
+
+ return(fd);
+ error:
+- close(fd);
++ os_close_file(fd);
+ return(err);
+ }
+
+@@ -389,10 +166,7 @@ int create_cow_file(char *cow_file, char
+ int sectorsize, int *bitmap_offset_out,
+ unsigned long *bitmap_len_out, int *data_offset_out)
+ {
+- __u64 blocks;
+- long zero;
+- int err, fd, i;
+- long long size;
++ int err, fd;
+
+ flags.c = 1;
+ fd = open_ubd_file(cow_file, &flags, NULL, NULL, NULL, NULL, NULL);
+@@ -403,29 +177,12 @@ int create_cow_file(char *cow_file, char
+ goto out;
+ }
+
+- err = write_cow_header(cow_file, fd, backing_file, sectorsize, &size);
+- if(err) goto out_close;
+-
+- blocks = (size + sectorsize - 1) / sectorsize;
+- blocks = (blocks + sizeof(long) * 8 - 1) / (sizeof(long) * 8);
+- zero = 0;
+- for(i = 0; i < blocks; i++){
+- err = write(fd, &zero, sizeof(zero));
+- if(err != sizeof(zero)){
+- printk("Write of bitmap to new COW file '%s' failed, "
+- "errno = %d\n", cow_file, errno);
+- goto out_close;
+- }
+- }
+-
+- sizes(size, sectorsize, sizeof(struct cow_header_v2),
+- bitmap_len_out, data_offset_out);
+- *bitmap_offset_out = sizeof(struct cow_header_v2);
+-
+- return(fd);
+-
+- out_close:
+- close(fd);
++ err = init_cow_file(fd, cow_file, backing_file, sectorsize,
++ bitmap_offset_out, bitmap_len_out,
++ data_offset_out);
++ if(!err)
++ return(fd);
++ os_close_file(fd);
+ out:
+ return(err);
+ }
+@@ -448,14 +205,6 @@ int write_ubd_fs(int fd, char *buffer, i
+ else return(n);
+ }
+
+-int ubd_is_dir(char *file)
+-{
+- struct stat64 buf;
+-
+- if(stat64(file, &buf) < 0) return(0);
+- return(S_ISDIR(buf.st_mode));
+-}
+-
+ void do_io(struct io_thread_req *req)
+ {
+ char *buf;
+diff -puN arch/um/drivers/xterm.c~uml-summa.diff arch/um/drivers/xterm.c
+--- limbo/arch/um/drivers/xterm.c~uml-summa.diff Tue Oct 21 16:42:37 2003
++++ limbo-god/arch/um/drivers/xterm.c Tue Oct 21 16:42:38 2003
+@@ -108,7 +108,7 @@ int xterm_open(int input, int output, in
+ }
+ close(fd);
+
+- fd = create_unix_socket(file, sizeof(file));
++ fd = create_unix_socket(file, sizeof(file), 1);
+ 
if(fd < 0){
+ printk("xterm_open : create_unix_socket failed, errno = %d\n",
+ -fd);
+diff -puN arch/um/drivers/xterm_kern.c~uml-summa.diff arch/um/drivers/xterm_kern.c
+--- limbo/arch/um/drivers/xterm_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003
++++ limbo-god/arch/um/drivers/xterm_kern.c Tue Oct 21 16:42:38 2003
+@@ -5,9 +5,12 @@
+
+ #include "linux/errno.h"
+ #include "linux/slab.h"
++#include "linux/signal.h"
++#include "linux/interrupt.h"
+ #include "asm/semaphore.h"
+ #include "asm/irq.h"
+ #include "irq_user.h"
++#include "irq_kern.h"
+ #include "kern_util.h"
+ #include "os.h"
+ #include "xterm.h"
+@@ -19,17 +22,18 @@ struct xterm_wait {
+ int new_fd;
+ };
+
+-static void xterm_interrupt(int irq, void *data, struct pt_regs *regs)
++static irqreturn_t xterm_interrupt(int irq, void *data, struct pt_regs *regs)
+ {
+ struct xterm_wait *xterm = data;
+ int fd;
+
+ fd = os_rcv_fd(xterm->fd, &xterm->pid);
+ if(fd == -EAGAIN)
+- return;
++ return(IRQ_NONE);
+
+ xterm->new_fd = fd;
+ up(&xterm->sem);
++ return(IRQ_HANDLED);
+ }
+
+ int xterm_fd(int socket, int *pid_out)
+diff -puN arch/um/dyn.lds.S~uml-summa.diff arch/um/dyn.lds.S
+--- limbo/arch/um/dyn.lds.S~uml-summa.diff Tue Oct 21 16:42:37 2003
++++ limbo-god/arch/um/dyn.lds.S Tue Oct 21 16:42:38 2003
+@@ -15,7 +15,11 @@ SECTIONS
+ . = ALIGN(4096); /* Init code and data */
+ _stext = .;
+ __init_begin = .;
+- .text.init : { *(.text.init) }
++ .init.text : {
++ _sinittext = .;
++ *(.init.text)
++ _einittext = .;
++ }
+
+ . = ALIGN(4096);
+
+@@ -67,7 +71,7 @@ SECTIONS
+
+ #include "asm/common.lds.S"
+
+- .data.init : { *(.data.init) }
++ init.data : { *(.init.data) }
+
+ /* Ensure the __preinit_array_start label is properly aligned. We
+ could instead move the label definition inside the section, but
+diff -puN arch/um/include/kern_util.h~uml-summa.diff arch/um/include/kern_util.h
+--- limbo/arch/um/include/kern_util.h~uml-summa.diff Tue Oct 21 16:42:37 2003
++++ limbo-god/arch/um/include/kern_util.h Tue Oct 21 16:42:38 2003
+@@ -63,10 +63,9 @@ extern void init_flush_vm(void);
+ extern void *syscall_sp(void *t);
+ extern void syscall_trace(void);
+ extern int hz(void);
+-extern void idle_timer(void);
++extern void uml_idle_timer(void);
+ extern unsigned int do_IRQ(int irq, union uml_pt_regs *regs);
+ extern int external_pid(void *t);
+-extern int pid_to_processor_id(int pid);
+ extern void boot_timer_handler(int sig);
+ extern void interrupt_end(void);
+ extern void initial_thread_cb(void (*proc)(void *), void *arg);
+@@ -90,9 +89,7 @@ extern int remove_gdb(void);
+ extern char *uml_strdup(char *string);
+ extern void unprotect_kernel_mem(void);
+ extern void protect_kernel_mem(void);
+-extern void set_kmem_end(unsigned long);
+ extern void uml_cleanup(void);
+-extern int pid_to_processor_id(int pid);
+ extern void set_current(void *t);
+ extern void lock_signalled_task(void *t);
+ extern void IPI_handler(int cpu);
+@@ -101,7 +98,9 @@ extern void *get_init_task(void);
+ extern int clear_user_proc(void *buf, int size);
+ extern int copy_to_user_proc(void *to, void *from, int size);
+ extern int copy_from_user_proc(void *to, void *from, int size);
++extern int strlen_user_proc(char *str);
+ extern void bus_handler(int sig, union uml_pt_regs *regs);
++extern void winch(int sig, union uml_pt_regs *regs);
+ extern long execute_syscall(void *r);
+ extern int smp_sigio_handler(void);
+ extern void *get_current(void);
+diff -puN arch/um/include/line.h~uml-summa.diff arch/um/include/line.h
+--- 
limbo/arch/um/include/line.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/include/line.h Tue Oct 21 16:42:38 2003 +@@ -9,12 +9,14 @@ + #include "linux/list.h" + #include "linux/workqueue.h" + #include "linux/tty.h" ++#include "linux/interrupt.h" + #include "asm/semaphore.h" + #include "chan_user.h" + #include "mconsole_kern.h" + + struct line_driver { + char *name; ++ char *device_name; + char *devfs_name; + short major; + short minor_start; +@@ -67,8 +69,9 @@ struct lines { + + #define LINES_INIT(n) { num : n } + +-extern void line_interrupt(int irq, void *data, struct pt_regs *unused); +-extern void line_write_interrupt(int irq, void *data, struct pt_regs *unused); ++extern irqreturn_t line_interrupt(int irq, void *data, struct pt_regs *unused); ++extern irqreturn_t line_write_interrupt(int irq, void *data, ++ struct pt_regs *unused); + extern void line_close(struct line *lines, struct tty_struct *tty); + extern int line_open(struct line *lines, struct tty_struct *tty, + struct chan_opts *opts); +diff -puN arch/um/include/mconsole.h~uml-summa.diff arch/um/include/mconsole.h +--- limbo/arch/um/include/mconsole.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/include/mconsole.h Tue Oct 21 16:42:38 2003 +@@ -77,6 +77,7 @@ extern void mconsole_sysrq(struct mc_req + extern void mconsole_cad(struct mc_request *req); + extern void mconsole_stop(struct mc_request *req); + extern void mconsole_go(struct mc_request *req); ++extern void mconsole_log(struct mc_request *req); + + extern int mconsole_get_request(int fd, struct mc_request *req); + extern int mconsole_notify(char *sock_name, int type, const void *data, +diff -puN arch/um/include/mem.h~uml-summa.diff arch/um/include/mem.h +--- limbo/arch/um/include/mem.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/include/mem.h Tue Oct 21 16:42:38 2003 +@@ -13,7 +13,6 @@ struct vm_reserved { + }; + + extern void set_usable_vm(unsigned long start, unsigned long end); +-extern void set_kmem_end(unsigned long new); + + #endif + +diff -puN arch/um/include/mem_user.h~uml-summa.diff arch/um/include/mem_user.h +--- limbo/arch/um/include/mem_user.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/include/mem_user.h Tue Oct 21 16:42:38 2003 +@@ -51,9 +51,6 @@ extern unsigned long task_size; + + extern int init_mem_user(void); + extern int create_mem_file(unsigned long len); +-extern void setup_range(int fd, char *driver, unsigned long start, +- unsigned long pfn, unsigned long total, int need_vm, +- struct mem_region *region, void *reserved); + extern void setup_memory(void *entry); + extern unsigned long find_iomem(char *driver, unsigned long *len_out); + extern int init_maps(struct mem_region *region); +diff -puN arch/um/include/os.h~uml-summa.diff arch/um/include/os.h +--- limbo/arch/um/include/os.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/include/os.h Tue Oct 21 16:42:38 2003 +@@ -103,10 +103,11 @@ extern int os_accept_connection(int fd); + extern int os_shutdown_socket(int fd, int r, int w); + extern void os_close_file(int fd); + extern int os_rcv_fd(int fd, int *helper_pid_out); +-extern int create_unix_socket(char *file, int len); ++extern int create_unix_socket(char *file, int len, int close_on_exec); + extern int os_connect_socket(char *name); + extern int os_file_type(char *file); + extern int os_file_mode(char *file, struct openflags *mode_out); ++extern int os_lock_file(int fd, int excl); + + extern unsigned long os_process_pc(int pid); + extern int 
os_process_parent(int pid); +@@ -120,6 +121,7 @@ extern int os_map_memory(void *virt, int + extern int os_protect_memory(void *addr, unsigned long len, + int r, int w, int x); + extern int os_unmap_memory(void *addr, int len); ++extern void os_flush_stdout(void); + + #endif + +diff -puN arch/um/include/sysdep-i386/checksum.h~uml-summa.diff arch/um/include/sysdep-i386/checksum.h +--- limbo/arch/um/include/sysdep-i386/checksum.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/include/sysdep-i386/checksum.h Tue Oct 21 16:42:38 2003 +@@ -6,6 +6,7 @@ + #define __UM_SYSDEP_CHECKSUM_H + + #include "linux/string.h" ++#include "linux/in6.h" + + /* + * computes the checksum of a memory block at buff, length len, +diff -puN arch/um/include/sysdep-i386/sigcontext.h~uml-summa.diff arch/um/include/sysdep-i386/sigcontext.h +--- limbo/arch/um/include/sysdep-i386/sigcontext.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/include/sysdep-i386/sigcontext.h Tue Oct 21 16:42:38 2003 +@@ -28,8 +28,8 @@ + */ + #define SC_START_SYSCALL(sc) do SC_EAX(sc) = -ENOSYS; while(0) + +-/* These are General Protection and Page Fault */ +-#define SEGV_IS_FIXABLE(trap) ((trap == 13) || (trap == 14)) ++/* This is Page Fault */ ++#define SEGV_IS_FIXABLE(trap) (trap == 14) + + #define SC_SEGV_IS_FIXABLE(sc) (SEGV_IS_FIXABLE(SC_TRAPNO(sc))) + +diff -puN arch/um/include/time_user.h~uml-summa.diff arch/um/include/time_user.h +--- limbo/arch/um/include/time_user.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/include/time_user.h Tue Oct 21 16:42:38 2003 +@@ -13,5 +13,8 @@ extern void idle_sleep(int secs); + extern void enable_timer(void); + extern unsigned long time_lock(void); + extern void time_unlock(unsigned long); ++#ifndef NSEC_PER_SEC ++#define NSEC_PER_SEC (1000000000L) ++#endif + + #endif +diff -puN arch/um/include/ubd_user.h~uml-summa.diff arch/um/include/ubd_user.h +--- limbo/arch/um/include/ubd_user.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/include/ubd_user.h Tue Oct 21 16:42:38 2003 +@@ -39,7 +39,6 @@ extern int read_ubd_fs(int fd, void *buf + extern int write_ubd_fs(int fd, char *buffer, int len); + extern int start_io_thread(unsigned long sp, int *fds_out); + extern void do_io(struct io_thread_req *req); +-extern int ubd_is_dir(char *file); + + static inline int ubd_test_bit(__u64 bit, unsigned char *data) + { +diff -puN arch/um/include/user.h~uml-summa.diff arch/um/include/user.h +--- limbo/arch/um/include/user.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/include/user.h Tue Oct 21 16:42:38 2003 +@@ -14,7 +14,7 @@ extern void *um_kmalloc_atomic(int size) + extern void kfree(void *ptr); + extern int in_aton(char *str); + extern int open_gdb_chan(void); +- ++extern int strlcpy(char *, const char *, int); + #endif + + /* +diff -puN arch/um/include/user_util.h~uml-summa.diff arch/um/include/user_util.h +--- limbo/arch/um/include/user_util.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/include/user_util.h Tue Oct 21 16:42:38 2003 +@@ -59,7 +59,6 @@ extern int wait_for_stop(int pid, int si + extern void *add_signal_handler(int sig, void (*handler)(int)); + extern int start_fork_tramp(void *arg, unsigned long temp_stack, + int clone_flags, int (*tramp)(void *)); +-extern int clone_and_wait(int (*fn)(void *), void *arg, void *sp, int flags); + extern int linux_main(int argc, char **argv); + extern void set_cmdline(char *cmd); + extern void input_cb(void (*proc)(void *), void *arg, int arg_len); +@@ -90,7 +89,8 @@ 
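The sigcontext.h hunk above narrows SEGV_IS_FIXABLE from traps 13 and 14 to trap 14 alone, presumably because only a page fault can be cured by mapping something in; a general protection fault (trap 13) has to be delivered to the process as a real signal. Schematically (both helpers are stand-ins, not UML functions):

#define TRAP_GP		13	/* general protection - not fixable */
#define TRAP_PF		14	/* page fault - possibly fixable */
#define SEGV_IS_FIXABLE(trap) ((trap) == TRAP_PF)

static void segv_sketch(int trapno)
{
	if(SEGV_IS_FIXABLE(trapno))
		try_page_fault_fixup();		/* stand-in */
	else relay_signal_to_process();		/* stand-in */
}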
extern int arch_handle_signal(int sig, u + extern int arch_fixup(unsigned long address, void *sc_ptr); + extern void forward_pending_sigio(int target); + extern int can_do_skas(void); +- ++extern void arch_init_thread(void); ++ + #endif + + /* +diff -puN arch/um/kernel/Makefile~uml-summa.diff arch/um/kernel/Makefile +--- limbo/arch/um/kernel/Makefile~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/Makefile Tue Oct 21 16:42:38 2003 +@@ -21,6 +21,8 @@ obj-$(CONFIG_TTY_LOG) += tty_log.o + obj-$(CONFIG_MODE_TT) += tt/ + obj-$(CONFIG_MODE_SKAS) += skas/ + ++clean-files := config.c ++ + user-objs-$(CONFIG_TTY_LOG) += tty_log.o + + USER_OBJS := $(filter %_user.o,$(obj-y)) $(user-objs-y) config.o helper.o \ +@@ -45,17 +47,13 @@ $(USER_OBJS) : %.o: %.c + $(obj)/frame.o: $(src)/frame.c + $(CC) $(CFLAGS_$(notdir $@)) -c -o $@ $< + +-QUOTE = 'my $$config=`cat $(TOPDIR)/.config`; $$config =~ s/"/\\"/g ; while(<STDIN>) { $$_ =~ s/CONFIG/$$config/; print $$_ }' ++QUOTE = 'my $$config=`cat $(TOPDIR)/.config`; $$config =~ s/"/\\"/g ; $$config =~ s/\n/\\n"\n"/g ; while(<STDIN>) { $$_ =~ s/CONFIG/$$config/; print $$_ }' + + $(obj)/config.c : $(src)/config.c.in $(TOPDIR)/.config + $(PERL) -e $(QUOTE) < $(src)/config.c.in > $@ + + $(obj)/config.o : $(obj)/config.c + +-clean: +- rm -f config.c +- for dir in $(subdir-y) ; do $(MAKE) -C $$dir clean; done +- + modules: + + fastdep: +diff -puN arch/um/kernel/config.c.in~uml-summa.diff arch/um/kernel/config.c.in +--- limbo/arch/um/kernel/config.c.in~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/config.c.in Tue Oct 21 16:42:38 2003 +@@ -7,9 +7,7 @@ + #include <stdlib.h> + #include "init.h" + +-static __initdata char *config = " +-CONFIG +-"; ++static __initdata char *config = "CONFIG"; + + static int __init print_config(char *line, int *add) + { +diff -puN arch/um/kernel/exec_kern.c~uml-summa.diff arch/um/kernel/exec_kern.c +--- limbo/arch/um/kernel/exec_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/exec_kern.c Tue Oct 21 16:42:38 2003 +@@ -32,10 +32,15 @@ void start_thread(struct pt_regs *regs, + CHOOSE_MODE_PROC(start_thread_tt, start_thread_skas, regs, eip, esp); + } + ++extern void log_exec(char **argv, void *tty); ++ + static int execve1(char *file, char **argv, char **env) + { + int error; + ++#ifdef CONFIG_TTY_LOG ++ log_exec(argv, current->tty); ++#endif + error = do_execve(file, argv, env, ¤t->thread.regs); + if (error == 0){ + current->ptrace &= ~PT_DTRACE; +diff -puN arch/um/kernel/init_task.c~uml-summa.diff arch/um/kernel/init_task.c +--- limbo/arch/um/kernel/init_task.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/init_task.c Tue Oct 21 16:42:38 2003 +@@ -18,6 +18,7 @@ static struct fs_struct init_fs = INIT_F + struct mm_struct init_mm = INIT_MM(init_mm); + static struct files_struct init_files = INIT_FILES; + static struct signal_struct init_signals = INIT_SIGNALS(init_signals); ++static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); + + EXPORT_SYMBOL(init_mm); + +@@ -43,26 +44,12 @@ union thread_union init_thread_union + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task) }; + +-struct task_struct *alloc_task_struct(void) +-{ +- return((struct task_struct *) +- __get_free_pages(GFP_KERNEL, CONFIG_KERNEL_STACK_ORDER)); +-} +- + void unprotect_stack(unsigned long stack) + { + protect_memory(stack, (1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE, + 1, 1, 0, 1); + } + +-void free_task_struct(struct task_struct 
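The config.c.in change above works together with the new QUOTE rule in the Makefile: GCC 3.3 dropped support for literal newlines inside string constants, so the Perl filter now rewrites each newline in .config as \n" followed by a reopening quote, producing one literal per config line. For an illustrative two-line .config, the generated config.c would come out roughly as:

/* illustrative generator output, not a real build artifact */
static __initdata char *config = "CONFIG_MODE_TT=y\n"
"CONFIG_MODE_SKAS=y\n"
"";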
*task) +-{ +- /* free_pages decrements the page counter and only actually frees +- * the pages if they are now not accessed by anything. +- */ +- free_pages((unsigned long) task, CONFIG_KERNEL_STACK_ORDER); +-} +- + /* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically +diff -puN arch/um/kernel/irq.c~uml-summa.diff arch/um/kernel/irq.c +--- limbo/arch/um/kernel/irq.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/irq.c Tue Oct 21 16:42:38 2003 +@@ -29,6 +29,7 @@ + #include "user_util.h" + #include "kern_util.h" + #include "irq_user.h" ++#include "irq_kern.h" + + static void register_irq_proc (unsigned int irq); + +@@ -83,65 +84,52 @@ struct hw_interrupt_type no_irq_type = { + end_none + }; + +-/* Not changed */ +-volatile unsigned long irq_err_count; +- + /* + * Generic, controller-independent functions: + */ + +-int get_irq_list(char *buf) ++int show_interrupts(struct seq_file *p, void *v) + { + int i, j; +- unsigned long flags; + struct irqaction * action; +- char *p = buf; ++ unsigned long flags; + +- p += sprintf(p, " "); +- for (j=0; j<num_online_cpus(); j++) +- p += sprintf(p, "CPU%d ",j); +- *p++ = '\n'; ++ seq_printf(p, " "); ++ for (j=0; j<NR_CPUS; j++) ++ if (cpu_online(j)) ++ seq_printf(p, "CPU%d ",j); ++ seq_putc(p, '\n'); + + for (i = 0 ; i < NR_IRQS ; i++) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) +- goto end; +- p += sprintf(p, "%3d: ",i); ++ goto skip; ++ seq_printf(p, "%3d: ",i); + #ifndef CONFIG_SMP +- p += sprintf(p, "%10u ", kstat_irqs(i)); ++ seq_printf(p, "%10u ", kstat_irqs(i)); + #else +- for (j = 0; j < num_online_cpus(); j++) +- p += sprintf(p, "%10u ", +- kstat_cpu(cpu_logical_map(j)).irqs[i]); ++ for (j = 0; j < NR_CPUS; j++) ++ if (cpu_online(j)) ++ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); + #endif +- p += sprintf(p, " %14s", irq_desc[i].handler->typename); +- p += sprintf(p, " %s", action->name); ++ seq_printf(p, " %14s", irq_desc[i].handler->typename); ++ seq_printf(p, " %s", action->name); + + for (action=action->next; action; action = action->next) +- p += sprintf(p, ", %s", action->name); +- *p++ = '\n'; +- end: ++ seq_printf(p, ", %s", action->name); ++ ++ seq_putc(p, '\n'); ++skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } +- p += sprintf(p, "\n"); +-#ifdef notdef +-#ifdef CONFIG_SMP +- p += sprintf(p, "LOC: "); +- for (j = 0; j < num_online_cpus(); j++) +- p += sprintf(p, "%10u ", +- apic_timer_irqs[cpu_logical_map(j)]); +- p += sprintf(p, "\n"); +-#endif +-#endif +- p += sprintf(p, "ERR: %10lu\n", irq_err_count); +- return p - buf; +-} ++ seq_printf(p, "NMI: "); ++ for (j = 0; j < NR_CPUS; j++) ++ if (cpu_online(j)) ++ seq_printf(p, "%10u ", nmi_count(j)); ++ seq_putc(p, '\n'); + +- +-int show_interrupts(struct seq_file *p, void *v) +-{ +- return(0); ++ return 0; + } + + /* +@@ -282,13 +270,12 @@ unsigned int do_IRQ(int irq, union uml_p + * 0 return value means that this irq is already being + * handled by some other CPU. 
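The get_irq_list() to show_interrupts() rewrite above moves /proc/interrupts onto the seq_file interface, which handles buffering and partial reads instead of the old fixed sprintf buffer. The registration side is not part of this patch (it lives in the generic proc code), but the usual 2.6 pattern for a seq_file-backed proc entry looks like this (all names hypothetical):

#include "linux/seq_file.h"
#include "linux/proc_fs.h"

static int example_show(struct seq_file *m, void *v)
{
	seq_printf(m, "value : %d\n", 42);	/* emit into the seq_file */
	return(0);
}

static int example_open(struct inode *inode, struct file *file)
{
	return(single_open(file, example_show, NULL));
}

static struct file_operations example_fops = {
	.open		= example_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};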
(or is disabled) + */ +- int cpu = smp_processor_id(); + irq_desc_t *desc = irq_desc + irq; + struct irqaction * action; + unsigned int status; + + irq_enter(); +- kstat_cpu(cpu).irqs[irq]++; ++ kstat_this_cpu.irqs[irq]++; + spin_lock(&desc->lock); + desc->handler->ack(irq); + /* +@@ -385,7 +372,7 @@ out: + */ + + int request_irq(unsigned int irq, +- void (*handler)(int, void *, struct pt_regs *), ++ irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, + const char * devname, + void *dev_id) +@@ -433,15 +420,19 @@ int request_irq(unsigned int irq, + EXPORT_SYMBOL(request_irq); + + int um_request_irq(unsigned int irq, int fd, int type, +- void (*handler)(int, void *, struct pt_regs *), ++ irqreturn_t (*handler)(int, void *, struct pt_regs *), + unsigned long irqflags, const char * devname, + void *dev_id) + { +- int retval; ++ int err; + +- retval = request_irq(irq, handler, irqflags, devname, dev_id); +- if(retval) return(retval); +- return(activate_fd(irq, fd, type, dev_id)); ++ err = request_irq(irq, handler, irqflags, devname, dev_id); ++ if(err) ++ return(err); ++ ++ if(fd != -1) ++ err = activate_fd(irq, fd, type, dev_id); ++ return(err); + } + + /* this was setup_x86_irq but it seems pretty generic */ +@@ -659,7 +650,7 @@ static int prof_cpu_mask_read_proc (char + return -EINVAL; + tmp = *mask; + for (k = 0; k < sizeof(cpumask_t)/sizeof(u16); ++k) { +- int j = sprintf(page, "%04hx", cpus_coerce(tmp)); ++ int j = sprintf(page, "%04hx", (short) cpus_coerce(tmp)); + len += j; + page += j; + cpus_shift_right(tmp, tmp, 16); +diff -puN arch/um/kernel/mem.c~uml-summa.diff arch/um/kernel/mem.c +--- limbo/arch/um/kernel/mem.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/mem.c Tue Oct 21 16:42:38 2003 +@@ -120,11 +120,6 @@ unsigned long get_kmem_end(void) + return(kmem_top); + } + +-void set_kmem_end(unsigned long new) +-{ +- kmem_top = new; +-} +- + #ifdef CONFIG_HIGHMEM + /* Changed during early boot */ + pte_t *kmap_pte; +@@ -222,7 +217,7 @@ static int setup_one_range(int fd, char + if(regions[i] == NULL) break; + } + if(i == NREGIONS){ +- printk("setup_range : no free regions\n"); ++ printk("setup_one_range : no free regions\n"); + i = -1; + goto out; + } +@@ -231,7 +226,9 @@ static int setup_one_range(int fd, char + fd = create_mem_file(len); + + if(region == NULL){ +- region = alloc_bootmem_low_pages(sizeof(*region)); ++ if(kmalloc_ok) ++ region = kmalloc(sizeof(*region), GFP_KERNEL); ++ else region = alloc_bootmem_low_pages(sizeof(*region)); + if(region == NULL) + panic("Failed to allocating mem_region"); + } +@@ -532,9 +529,9 @@ int nregions(void) + return(NREGIONS); + } + +-void setup_range(int fd, char *driver, unsigned long start, unsigned long pfn, +- unsigned long len, int need_vm, struct mem_region *region, +- void *reserved) ++static void setup_range(int fd, char *driver, unsigned long start, ++ unsigned long pfn, unsigned long len, int need_vm, ++ struct mem_region *region, void *reserved) + { + int i, cur; + +diff -puN arch/um/kernel/mem_user.c~uml-summa.diff arch/um/kernel/mem_user.c +--- limbo/arch/um/kernel/mem_user.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/mem_user.c Tue Oct 21 16:42:38 2003 +@@ -111,6 +111,11 @@ int setup_region(struct mem_region *regi + offset = 0; + } + ++ if(offset >= region->len){ ++ printf("%ld bytes of physical memory is insufficient\n", ++ region->len); ++ exit(1); ++ } + loc = mmap(start, region->len - offset, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, 
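um_request_irq() above now also tolerates fd == -1, meaning "register the handler but don't poll a descriptor", which the SIGWINCH registration later in this patch relies on. A caller sketch (EXAMPLE_IRQ and example_interrupt are made up for the illustration):

#include "irq_kern.h"	/* the new header introduced by this patch */

static int example_attach(int fd, void *dev)
{
	int err;

	err = um_request_irq(EXAMPLE_IRQ, fd, IRQ_READ, example_interrupt,
			     SA_INTERRUPT, "example", dev);
	if(err)
		printk("example_attach : um_request_irq failed, err = %d\n",
		       err);
	return(err);
}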
region->fd, offset); + if(loc != start){ +@@ -122,26 +127,26 @@ int setup_region(struct mem_region *regi + + static int __init parse_iomem(char *str, int *add) + { +- struct stat buf; ++ struct stat64 buf; + char *file, *driver; + int fd; + + driver = str; + file = strchr(str,','); + if(file == NULL){ +- printk("parse_iomem : failed to parse iomem\n"); ++ printf("parse_iomem : failed to parse iomem\n"); + return(1); + } + *file = '\0'; + file++; + fd = os_open_file(file, of_rdwr(OPENFLAGS()), 0); + if(fd < 0){ +- printk("parse_iomem - Couldn't open io file, errno = %d\n", ++ printf("parse_iomem - Couldn't open io file, errno = %d\n", + errno); + return(1); + } +- if(fstat(fd, &buf) < 0) { +- printk("parse_iomem - cannot fstat file, errno = %d\n", errno); ++ if(fstat64(fd, &buf) < 0) { ++ printf("parse_iomem - cannot fstat file, errno = %d\n", errno); + return(1); + } + add_iomem(driver, fd, buf.st_size); +diff -puN arch/um/kernel/process.c~uml-summa.diff arch/um/kernel/process.c +--- limbo/arch/um/kernel/process.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/process.c Tue Oct 21 16:42:38 2003 +@@ -72,7 +72,6 @@ void init_new_thread_signals(int altstac + SIGUSR1, SIGIO, SIGWINCH, SIGALRM, SIGVTALRM, -1); + set_handler(SIGUSR2, (__sighandler_t) sig_handler, + SA_NOMASK | flags, -1); +- (void) CHOOSE_MODE(signal(SIGCHLD, SIG_IGN), (void *) 0); + signal(SIGHUP, SIG_IGN); + + init_irq_signals(altstack); +@@ -127,7 +126,8 @@ int start_fork_tramp(void *thread_arg, u + if(err < 0) panic("Waiting for outer trampoline failed - errno = %d", + errno); + if(!WIFSIGNALED(status) || (WTERMSIG(status) != SIGKILL)) +- panic("outer trampoline didn't exit with SIGKILL"); ++ panic("outer trampoline didn't exit with SIGKILL, " ++ "status = %d", status); + + return(arg.pid); + } +@@ -229,11 +229,11 @@ void __init check_ptrace(void) + + int run_kernel_thread(int (*fn)(void *), void *arg, void **jmp_ptr) + { +- jmp_buf buf; ++ sigjmp_buf buf; + int n; + + *jmp_ptr = &buf; +- n = setjmp(buf); ++ n = sigsetjmp(buf,1); + if(n != 0) + return(n); + (*fn)(arg); +diff -puN arch/um/kernel/process_kern.c~uml-summa.diff arch/um/kernel/process_kern.c +--- limbo/arch/um/kernel/process_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/process_kern.c Tue Oct 21 16:42:38 2003 +@@ -26,6 +26,7 @@ + #include "asm/spinlock.h" + #include "asm/uaccess.h" + #include "asm/user.h" ++#include "asm/io.h" + #include "user_util.h" + #include "kern_util.h" + #include "kern.h" +@@ -52,17 +53,12 @@ struct cpu_task cpu_tasks[NR_CPUS] = { [ + + struct task_struct *get_task(int pid, int require) + { +- struct task_struct *task, *ret; ++ struct task_struct *ret; + +- ret = NULL; + read_lock(&tasklist_lock); +- for_each_process(task){ +- if(task->pid == pid){ +- ret = task; +- break; +- } +- } ++ ret = find_task_by_pid(pid); + read_unlock(&tasklist_lock); ++ + if(require && (ret == NULL)) panic("get_task couldn't find a task\n"); + return(ret); + } +@@ -103,13 +99,14 @@ unsigned long alloc_stack(int order, int + + int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) + { +- struct task_struct *p; ++ int pid; + + current->thread.request.u.thread.proc = fn; + current->thread.request.u.thread.arg = arg; +- p = do_fork(CLONE_VM | flags, 0, NULL, 0, NULL, NULL); +- if(IS_ERR(p)) panic("do_fork failed in kernel_thread"); +- return(p->pid); ++ pid = do_fork(CLONE_VM | flags, 0, NULL, 0, NULL, NULL); ++ if(pid < 0) ++ panic("do_fork failed in kernel_thread, errno = %d", pid); ++ 
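The jmp_buf to sigjmp_buf conversions here (and throughout the rest of this patch) are not cosmetic: plain setjmp()/longjmp() need not preserve the signal mask, while sigsetjmp(buf, 1)/siglongjmp() guarantee it, which matters when UML jumps between thread contexts with signals blocked. A standalone demonstration:

#include <stdio.h>
#include <setjmp.h>
#include <signal.h>

int main(void)
{
	sigjmp_buf buf;
	sigset_t set, cur;

	sigemptyset(&set);
	sigaddset(&set, SIGVTALRM);

	if(sigsetjmp(buf, 1) == 0){	/* savemask = 1 : remember the mask */
		sigprocmask(SIG_BLOCK, &set, NULL);	/* block SIGVTALRM */
		siglongjmp(buf, 1);	/* restores the saved (empty) mask */
	}
	sigprocmask(SIG_SETMASK, NULL, &cur);
	printf("SIGVTALRM %s after the jump\n",
	       sigismember(&cur, SIGVTALRM) ? "still blocked" : "unblocked");
	return(0);			/* prints "unblocked" */
}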
return(pid); + } + + void switch_mm(struct mm_struct *prev, struct mm_struct *next, +@@ -129,7 +126,7 @@ void set_current(void *t) + { external_pid(task), task }); + } + +-void *switch_to(void *prev, void *next, void *last) ++void *_switch_to(void *prev, void *next, void *last) + { + return(CHOOSE_MODE(switch_to_tt(prev, next), + switch_to_skas(prev, next))); +@@ -149,7 +146,7 @@ void release_thread(struct task_struct * + void exit_thread(void) + { + CHOOSE_MODE(exit_thread_tt(), exit_thread_skas()); +- unprotect_stack((unsigned long) current->thread_info); ++ unprotect_stack((unsigned long) current_thread); + } + + void *get_current(void) +@@ -157,6 +154,10 @@ void *get_current(void) + return(current); + } + ++void prepare_to_copy(struct task_struct *tsk) ++{ ++} ++ + int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, + unsigned long stack_top, struct task_struct * p, + struct pt_regs *regs) +@@ -190,7 +191,7 @@ int current_pid(void) + + void default_idle(void) + { +- idle_timer(); ++ uml_idle_timer(); + + atomic_inc(&init_mm.mm_count); + current->mm = &init_mm; +@@ -251,11 +252,12 @@ void *um_virt_to_phys(struct task_struct + char *current_cmd(void) + { + #if defined(CONFIG_SMP) || defined(CONFIG_HIGHMEM) +- return("(Unknown)"); ++ void *addr = virt_to_phys( current->comm); + #else + void *addr = um_virt_to_phys(current, current->mm->arg_start, NULL); + return IS_ERR(addr) ? "(Unknown)": __va((unsigned long) addr); + #endif ++ return IS_ERR(addr) ? "(Unknown)": __va((unsigned long) addr); + } + + void force_sigbus(void) +@@ -367,10 +369,15 @@ int clear_user_proc(void *buf, int size) + return(clear_user(buf, size)); + } + ++int strlen_user_proc(char *str) ++{ ++ return(strlen_user(str)); ++} ++ + int smp_sigio_handler(void) + { + #ifdef CONFIG_SMP +- int cpu = current->thread_info->cpu; ++ int cpu = current_thread->cpu; + IPI_handler(cpu); + if(cpu != 0) + return(1); +@@ -385,7 +392,7 @@ int um_in_interrupt(void) + + int cpu(void) + { +- return(current->thread_info->cpu); ++ return(current_thread->cpu); + } + + /* +diff -puN arch/um/kernel/ptrace.c~uml-summa.diff arch/um/kernel/ptrace.c +--- limbo/arch/um/kernel/ptrace.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/ptrace.c Tue Oct 21 16:42:38 2003 +@@ -311,11 +311,8 @@ void syscall_trace(void) + + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ +- current->exit_code = SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) +- ? 0x80 : 0); +- current->state = TASK_STOPPED; +- notify_parent(current, SIGCHLD); +- schedule(); ++ ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ++ ? 
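The recurring current->thread_info to current_thread substitution in these hunks points at a new accessor whose definition lives in include/asm-um/current.h (listed in this changeset but not quoted in this excerpt). A plausible reconstruction, purely as a reading aid; the real definition may differ:

/* hypothetical reconstruction of the asm-um/current.h helper */
#define current_thread	current_thread_info()

/* call sites then read, e.g. :	cpu = current_thread->cpu; */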
0x80 : 0)); + + /* + * this isn't the same as continuing with a signal, but it will do +diff -puN arch/um/kernel/sigio_kern.c~uml-summa.diff arch/um/kernel/sigio_kern.c +--- limbo/arch/um/kernel/sigio_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/sigio_kern.c Tue Oct 21 16:42:38 2003 +@@ -6,18 +6,21 @@ + #include "linux/kernel.h" + #include "linux/list.h" + #include "linux/slab.h" +-#include "asm/irq.h" ++#include "linux/signal.h" ++#include "linux/interrupt.h" + #include "init.h" + #include "sigio.h" + #include "irq_user.h" ++#include "irq_kern.h" + + /* Protected by sigio_lock() called from write_sigio_workaround */ + static int sigio_irq_fd = -1; + +-void sigio_interrupt(int irq, void *data, struct pt_regs *unused) ++irqreturn_t sigio_interrupt(int irq, void *data, struct pt_regs *unused) + { + read_sigio_fd(sigio_irq_fd); + reactivate_fd(sigio_irq_fd, SIGIO_WRITE_IRQ); ++ return(IRQ_HANDLED); + } + + int write_sigio_irq(int fd) +diff -puN arch/um/kernel/signal_kern.c~uml-summa.diff arch/um/kernel/signal_kern.c +--- limbo/arch/um/kernel/signal_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/signal_kern.c Tue Oct 21 16:42:38 2003 +@@ -36,7 +36,7 @@ static void force_segv(int sig) + if(sig == SIGSEGV){ + struct k_sigaction *ka; + +- ka = ¤t->sig->action[SIGSEGV - 1]; ++ ka = ¤t->sighand->action[SIGSEGV - 1]; + ka->sa.sa_handler = SIG_DFL; + } + force_sig(SIGSEGV, current); +@@ -142,7 +142,7 @@ static int kern_do_signal(struct pt_regs + return(0); + + /* Whee! Actually deliver the signal. */ +- ka = ¤t->sig->action[sig -1 ]; ++ ka = ¤t->sighand->action[sig -1 ]; + err = handle_signal(regs, sig, ka, &info, oldset, error); + if(!err) return(1); + +@@ -201,7 +201,7 @@ int sys_sigsuspend(int history0, int his + } + } + +-int sys_rt_sigsuspend(sigset_t *unewset, size_t sigsetsize) ++int sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize) + { + sigset_t saveset, newset; + +@@ -227,6 +227,42 @@ int sys_rt_sigsuspend(sigset_t *unewset, + } + } + ++int sys_sigaction(int sig, const struct old_sigaction __user *act, ++ struct old_sigaction __user *oact) ++{ ++ struct k_sigaction new_ka, old_ka; ++ int ret; ++ ++ if (act) { ++ old_sigset_t mask; ++ if (verify_area(VERIFY_READ, act, sizeof(*act)) || ++ __get_user(new_ka.sa.sa_handler, &act->sa_handler) || ++ __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) ++ return -EFAULT; ++ __get_user(new_ka.sa.sa_flags, &act->sa_flags); ++ __get_user(mask, &act->sa_mask); ++ siginitset(&new_ka.sa.sa_mask, mask); ++ } ++ ++ ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? 
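For reference, the 0x80 in the syscall_trace() hunk a little above is the TRACESYSGOOD convention as seen from the tracing side: after a PTRACE_SYSCALL stop, WSTOPSIG() reports SIGTRAP | 0x80 for syscall stops, so the tracer can tell them from genuine SIGTRAPs. Tracer-side classification, schematically (not UML code):

#include <signal.h>
#include <sys/wait.h>

static char *classify_stop(int status)
{
	if(!WIFSTOPPED(status))
		return("not a stop");
	if(WSTOPSIG(status) == (SIGTRAP | 0x80))
		return("syscall stop");		/* TRACESYSGOOD marking */
	if(WSTOPSIG(status) == SIGTRAP)
		return("real SIGTRAP");
	return("signal delivery stop");
}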
&old_ka : NULL); ++ ++ if (!ret && oact) { ++ if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) || ++ __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || ++ __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) ++ return -EFAULT; ++ __put_user(old_ka.sa.sa_flags, &oact->sa_flags); ++ __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); ++ } ++ ++ return ret; ++} ++ ++int sys_sigaltstack(const stack_t *uss, stack_t *uoss) ++{ ++ return(do_sigaltstack(uss, uoss, PT_REGS_SP(¤t->thread.regs))); ++} ++ + static int copy_sc_from_user(struct pt_regs *to, void *from, + struct arch_frame_data *arch) + { +@@ -239,8 +275,8 @@ static int copy_sc_from_user(struct pt_r + + int sys_sigreturn(struct pt_regs regs) + { +- void *sc = sp_to_sc(PT_REGS_SP(¤t->thread.regs)); +- void *mask = sp_to_mask(PT_REGS_SP(¤t->thread.regs)); ++ void __user *sc = sp_to_sc(PT_REGS_SP(¤t->thread.regs)); ++ void __user *mask = sp_to_mask(PT_REGS_SP(¤t->thread.regs)); + int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); + + spin_lock_irq(¤t->sighand->siglock); +@@ -257,7 +293,8 @@ int sys_sigreturn(struct pt_regs regs) + + int sys_rt_sigreturn(struct pt_regs regs) + { +- struct ucontext *uc = sp_to_uc(PT_REGS_SP(¤t->thread.regs)); ++ unsigned long sp = PT_REGS_SP(¤t->thread.regs); ++ struct ucontext __user *uc = sp_to_uc(sp); + void *fp; + int sig_size = _NSIG_WORDS * sizeof(unsigned long); + +diff -puN arch/um/kernel/skas/Makefile~uml-summa.diff arch/um/kernel/skas/Makefile +--- limbo/arch/um/kernel/skas/Makefile~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/skas/Makefile Tue Oct 21 16:42:38 2003 +@@ -7,18 +7,22 @@ obj-y = exec_kern.o exec_user.o mem.o me + process_kern.o syscall_kern.o syscall_user.o time.o tlb.o trap_user.o \ + sys-$(SUBARCH)/ + ++host-progs := util/mk_ptregs ++clean-files := include/skas_ptregs.h ++ + USER_OBJS = $(filter %_user.o,$(obj-y)) process.o time.o + USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) + +-include/skas_ptregs.h : util/mk_ptregs +- util/mk_ptregs > $@ +- +-util/mk_ptregs : +- $(MAKE) -C util ++$(TOPDIR)/arch/um/include/skas_ptregs.h : $(src)/util/mk_ptregs ++ @echo -n ' Generating $@' ++ @$< > $@.tmp ++ @if [ -r $@ ] && cmp -s $@ $@.tmp; then \ ++ echo ' (unchanged)'; \ ++ rm -f $@.tmp; \ ++ else \ ++ echo ' (updated)'; \ ++ mv -f $@.tmp $@; \ ++ fi + + $(USER_OBJS) : %.o: %.c + $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< +- +-clean : +- $(MAKE) -C util clean +- $(RM) -f include/skas_ptregs.h +diff -puN arch/um/kernel/skas/include/mode.h~uml-summa.diff arch/um/kernel/skas/include/mode.h +--- limbo/arch/um/kernel/skas/include/mode.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/skas/include/mode.h Tue Oct 21 16:42:38 2003 +@@ -20,6 +20,7 @@ extern void sig_handler_common_skas(int + extern void halt_skas(void); + extern void reboot_skas(void); + extern void kill_off_processes_skas(void); ++extern int is_skas_winch(int pid, int fd, void *data); + + #endif + +diff -puN arch/um/kernel/skas/include/uaccess.h~uml-summa.diff arch/um/kernel/skas/include/uaccess.h +--- limbo/arch/um/kernel/skas/include/uaccess.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/skas/include/uaccess.h Tue Oct 21 16:42:38 2003 +@@ -19,7 +19,7 @@ + #define access_ok_skas(type, addr, size) \ + ((segment_eq(get_fs(), KERNEL_DS)) || \ + (((unsigned long) (addr) < TASK_SIZE) && \ +- ((unsigned long) (addr) + (size) < TASK_SIZE))) ++ ((unsigned long) (addr) + (size) <= TASK_SIZE))) + + static inline int 
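The one-character change in access_ok_skas() above is an off-by-one fix: size bytes at addr occupy the half-open range [addr, addr + size), so the last legal buffer ends exactly at TASK_SIZE and addr + size == TASK_SIZE must be accepted. A standalone check with an illustrative TASK_SIZE:

#include <stdio.h>

#define TASK_SIZE 0x80000000UL		/* illustrative, not UML's value */

static int ok_old(unsigned long addr, unsigned long size)
{
	return((addr < TASK_SIZE) && (addr + size < TASK_SIZE));
}

static int ok_new(unsigned long addr, unsigned long size)
{
	return((addr < TASK_SIZE) && (addr + size <= TASK_SIZE));
}

int main(void)
{
	/* a page-sized buffer flush against the top of user space */
	unsigned long addr = TASK_SIZE - 4096;

	printf("old : %d, new : %d\n", ok_old(addr, 4096), ok_new(addr, 4096));
	return(0);			/* prints "old : 0, new : 1" */
}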
verify_area_skas(int type, const void * addr, + unsigned long size) +diff -puN arch/um/kernel/skas/process.c~uml-summa.diff arch/um/kernel/skas/process.c +--- limbo/arch/um/kernel/skas/process.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/skas/process.c Tue Oct 21 16:42:38 2003 +@@ -4,6 +4,7 @@ + */ + + #include <stdlib.h> ++#include <unistd.h> + #include <errno.h> + #include <signal.h> + #include <setjmp.h> +@@ -24,6 +25,16 @@ + #include "os.h" + #include "proc_mm.h" + #include "skas_ptrace.h" ++#include "chan_user.h" ++ ++int is_skas_winch(int pid, int fd, void *data) ++{ ++ if(pid != getpid()) ++ return(0); ++ ++ register_winch_irq(-1, fd, -1, data); ++ return(1); ++} + + unsigned long exec_regs[FRAME_SIZE]; + unsigned long exec_fp_regs[HOST_FP_SIZE]; +@@ -48,11 +59,11 @@ static void handle_trap(int pid, union u + int err, syscall_nr, status; + + syscall_nr = PT_SYSCALL_NR(regs->skas.regs); ++ UPT_SYSCALL_NR(regs) = syscall_nr; + if(syscall_nr < 1){ + relay_signal(SIGTRAP, regs); + return; + } +- UPT_SYSCALL_NR(regs) = syscall_nr; + + err = ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET, __NR_getpid); + if(err < 0) +@@ -72,8 +83,6 @@ static void handle_trap(int pid, union u + handle_syscall(regs); + } + +-int userspace_pid; +- + static int userspace_tramp(void *arg) + { + init_new_thread_signals(0); +@@ -83,6 +92,8 @@ static int userspace_tramp(void *arg) + return(0); + } + ++int userspace_pid; ++ + void start_userspace(void) + { + void *stack; +@@ -149,6 +160,7 @@ void userspace(union uml_pt_regs *regs) + case SIGILL: + case SIGBUS: + case SIGFPE: ++ case SIGWINCH: + user_signal(WSTOPSIG(status), regs); + break; + default: +@@ -172,12 +184,12 @@ void userspace(union uml_pt_regs *regs) + void new_thread(void *stack, void **switch_buf_ptr, void **fork_buf_ptr, + void (*handler)(int)) + { +- jmp_buf switch_buf, fork_buf; ++ sigjmp_buf switch_buf, fork_buf; + + *switch_buf_ptr = &switch_buf; + *fork_buf_ptr = &fork_buf; + +- if(setjmp(fork_buf) == 0) ++ if(sigsetjmp(fork_buf,1) == 0) + new_thread_proc(stack, handler); + + remove_sigstack(); +@@ -185,12 +197,12 @@ void new_thread(void *stack, void **swit + + void thread_wait(void *sw, void *fb) + { +- jmp_buf buf, **switch_buf = sw, *fork_buf; ++ sigjmp_buf buf, **switch_buf = sw, *fork_buf; + + *switch_buf = &buf; + fork_buf = fb; +- if(setjmp(buf) == 0) +- longjmp(*fork_buf, 1); ++ if(sigsetjmp(buf,1) == 0) ++ siglongjmp(*fork_buf, 1); + } + + static int move_registers(int int_op, int fp_op, union uml_pt_regs *regs, +@@ -245,34 +257,34 @@ void restore_registers(union uml_pt_regs + + void switch_threads(void *me, void *next) + { +- jmp_buf my_buf, **me_ptr = me, *next_buf = next; ++ sigjmp_buf my_buf, **me_ptr = me, *next_buf = next; + + *me_ptr = &my_buf; +- if(setjmp(my_buf) == 0) +- longjmp(*next_buf, 1); ++ if(sigsetjmp(my_buf,1) == 0) ++ siglongjmp(*next_buf, 1); + } + +-static jmp_buf initial_jmpbuf; ++static sigjmp_buf initial_jmpbuf; + + /* XXX Make these percpu */ + static void (*cb_proc)(void *arg); + static void *cb_arg; +-static jmp_buf *cb_back; ++static sigjmp_buf *cb_back; + + int start_idle_thread(void *stack, void *switch_buf_ptr, void **fork_buf_ptr) + { +- jmp_buf **switch_buf = switch_buf_ptr; ++ sigjmp_buf **switch_buf = switch_buf_ptr; + int n; + + *fork_buf_ptr = &initial_jmpbuf; +- n = setjmp(initial_jmpbuf); ++ n = sigsetjmp(initial_jmpbuf,1); + if(n == 0) + new_thread_proc((void *) stack, new_thread_handler); + else if(n == 1) + remove_sigstack(); + else if(n == 2){ + 
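handle_trap() above also shows the skas syscall interception trick: the host is about to execute the process's syscall, so UML rewrites the syscall number to __NR_getpid with PTRACE_POKEUSER, lets that harmless call complete, and performs the real syscall itself in handle_syscall(). The ptrace half in isolation (PT_SYSCALL_NR_OFFSET is UML's arch-specific user-area offset, e.g. ORIG_EAX on i386; error handling trimmed):

#include <sys/ptrace.h>
#include <asm/unistd.h>
#include <errno.h>

/* illustrative : annul the child's pending syscall by turning it
 * into getpid(), then resume it to the syscall-exit stop */
static int nullify_syscall_sketch(int pid)
{
	if(ptrace(PTRACE_POKEUSER, pid, PT_SYSCALL_NR_OFFSET,
		  __NR_getpid) < 0)
		return(-errno);
	if(ptrace(PTRACE_SYSCALL, pid, 0, 0) < 0)
		return(-errno);
	return(0);
}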
(*cb_proc)(cb_arg); +- longjmp(*cb_back, 1); ++ siglongjmp(*cb_back, 1); + } + else if(n == 3){ + kmalloc_ok = 0; +@@ -282,7 +294,7 @@ int start_idle_thread(void *stack, void + kmalloc_ok = 0; + return(1); + } +- longjmp(**switch_buf, 1); ++ siglongjmp(**switch_buf, 1); + } + + void remove_sigstack(void) +@@ -297,15 +309,15 @@ void remove_sigstack(void) + + void initial_thread_cb_skas(void (*proc)(void *), void *arg) + { +- jmp_buf here; ++ sigjmp_buf here; + + cb_proc = proc; + cb_arg = arg; + cb_back = &here; + + block_signals(); +- if(setjmp(here) == 0) +- longjmp(initial_jmpbuf, 2); ++ if(sigsetjmp(here,1) == 0) ++ siglongjmp(initial_jmpbuf, 2); + unblock_signals(); + + cb_proc = NULL; +@@ -316,19 +328,20 @@ void initial_thread_cb_skas(void (*proc) + void halt_skas(void) + { + block_signals(); +- longjmp(initial_jmpbuf, 3); ++ siglongjmp(initial_jmpbuf, 3); + } + + void reboot_skas(void) + { + block_signals(); +- longjmp(initial_jmpbuf, 4); ++ siglongjmp(initial_jmpbuf, 4); + } + + int new_mm(int from) + { + struct proc_mm_op copy; +- int n, fd = os_open_file("/proc/mm", of_write(OPENFLAGS()), 0); ++ int n, fd = os_open_file("/proc/mm", ++ of_cloexec(of_write(OPENFLAGS())), 0); + + if(fd < 0) + return(-errno); +@@ -342,6 +355,7 @@ int new_mm(int from) + printk("new_mm : /proc/mm copy_segments failed, " + "errno = %d\n", errno); + } ++ + return(fd); + } + +diff -puN arch/um/kernel/skas/process_kern.c~uml-summa.diff arch/um/kernel/skas/process_kern.c +--- limbo/arch/um/kernel/skas/process_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/skas/process_kern.c Tue Oct 21 16:42:38 2003 +@@ -61,9 +61,8 @@ void new_thread_handler(int sig) + thread_wait(¤t->thread.mode.skas.switch_buf, + current->thread.mode.skas.fork_buf); + +-#ifdef CONFIG_SMP +- schedule_tail(NULL); +-#endif ++ if(current->thread.prev_sched != NULL) ++ schedule_tail(current->thread.prev_sched); + current->thread.prev_sched = NULL; + + n = run_kernel_thread(fn, arg, ¤t->thread.exec_buf); +@@ -93,9 +92,8 @@ void fork_handler(int sig) + current->thread.mode.skas.fork_buf); + + force_flush_all(); +-#ifdef CONFIG_SMP +- schedule_tail(current->thread.prev_sched); +-#endif ++ if(current->thread.prev_sched != NULL) ++ schedule_tail(current->thread.prev_sched); + current->thread.prev_sched = NULL; + unblock_signals(); + +@@ -136,7 +134,7 @@ int copy_thread_skas(int nr, unsigned lo + + void init_idle_skas(void) + { +- cpu_tasks[current->thread_info->cpu].pid = os_getpid(); ++ cpu_tasks[current_thread->cpu].pid = os_getpid(); + default_idle(); + } + +@@ -164,7 +162,7 @@ int start_uml_skas(void) + capture_signal_stack(); + + init_new_thread_signals(1); +- idle_timer(); ++ uml_idle_timer(); + + init_task.thread.request.u.thread.proc = start_kernel_proc; + init_task.thread.request.u.thread.arg = NULL; +diff -puN arch/um/kernel/skas/util/mk_ptregs.c~uml-summa.diff arch/um/kernel/skas/util/mk_ptregs.c +--- limbo/arch/um/kernel/skas/util/mk_ptregs.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/skas/util/mk_ptregs.c Tue Oct 21 16:42:38 2003 +@@ -1,3 +1,4 @@ ++#include <stdio.h> + #include <asm/ptrace.h> + #include <asm/user.h> + +diff -puN arch/um/kernel/smp.c~uml-summa.diff arch/um/kernel/smp.c +--- limbo/arch/um/kernel/smp.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/smp.c Tue Oct 21 16:42:38 2003 +@@ -23,7 +23,7 @@ + #include "os.h" + + /* CPU online map, set by smp_boot_cpus */ +-unsigned long cpu_online_map = cpumask_of_cpu(0); ++unsigned long cpu_online_map = 
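start_idle_thread() and its callers above use the siglongjmp value on initial_jmpbuf as a small command protocol. With illustrative names for the codes (the numbers come from the code; the names are invented):

/* illustrative names only - the code uses bare constants */
enum {
	INITJMP_NEW_THREAD	= 1,	/* first switch : remove_sigstack() */
	INITJMP_CALLBACK	= 2,	/* run (*cb_proc)(cb_arg), jump back */
	INITJMP_HALT		= 3,	/* halt_skas() lands here */
	INITJMP_REBOOT		= 4,	/* reboot_skas() lands here */
};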
CPU_MASK_NONE; + + EXPORT_SYMBOL(cpu_online_map); + +@@ -100,15 +100,15 @@ void smp_send_stop(void) + + printk(KERN_INFO "Stopping all CPUs..."); + for(i = 0; i < num_online_cpus(); i++){ +- if(i == current->thread_info->cpu) ++ if(i == current_thread->cpu) + continue; + write(cpu_data[i].ipi_pipe[1], "S", 1); + } + printk("done\n"); + } + +-static cpumask_t smp_commenced_mask; +-static cpumask_t smp_callin_map = CPU_MASK_NONE; ++static cpumask_t smp_commenced_mask = CPU_MASK_NONE; ++static cpumask_t cpu_callin_map = CPU_MASK_NONE; + + static int idle_proc(void *cpup) + { +@@ -123,12 +123,12 @@ static int idle_proc(void *cpup) + current->thread.mode.tt.extern_pid); + + wmb(); +- if (cpu_test_and_set(cpu, &smp_callin_map)) { ++ if (cpu_test_and_set(cpu, cpu_callin_map)) { + printk("huh, CPU#%d already present??\n", cpu); + BUG(); + } + +- while (!cpu_isset(cpu, &smp_commenced_mask)) ++ while (!cpu_isset(cpu, smp_commenced_mask)) + cpu_relax(); + + cpu_set(cpu, cpu_online_map); +@@ -143,8 +143,11 @@ static struct task_struct *idle_thread(i + + current->thread.request.u.thread.proc = idle_proc; + current->thread.request.u.thread.arg = (void *) cpu; +- new_task = do_fork(CLONE_VM | CLONE_IDLETASK, 0, NULL, 0, NULL, NULL); +- if(IS_ERR(new_task)) panic("do_fork failed in idle_thread"); ++ new_task = copy_process(CLONE_VM | CLONE_IDLETASK, 0, NULL, 0, NULL, ++ NULL); ++ if(IS_ERR(new_task)) ++ panic("copy_process failed in idle_thread, error = %ld", ++ PTR_ERR(new_task)); + + cpu_tasks[cpu] = ((struct cpu_task) + { .pid = new_task->thread.mode.tt.extern_pid, +@@ -153,6 +156,7 @@ static struct task_struct *idle_thread(i + CHOOSE_MODE(write(new_task->thread.mode.tt.switch_pipe[1], &c, + sizeof(c)), + ({ panic("skas mode doesn't support SMP"); })); ++ wake_up_forked_process(new_task); + return(new_task); + } + +@@ -160,15 +164,16 @@ void smp_prepare_cpus(unsigned int maxcp + { + struct task_struct *idle; + unsigned long waittime; +- int err, cpu; ++ int err, cpu, me = smp_processor_id(); + +- cpu_set(0, cpu_online_map); +- cpu_set(0, smp_callin_map); ++ cpu_clear(me, cpu_online_map); ++ cpu_set(me, cpu_online_map); ++ cpu_set(me, cpu_callin_map); + +- err = os_pipe(cpu_data[0].ipi_pipe, 1, 1); ++ err = os_pipe(cpu_data[me].ipi_pipe, 1, 1); + if(err) panic("CPU#0 failed to create IPI pipe, errno = %d", -err); + +- activate_ipi(cpu_data[0].ipi_pipe[0], ++ activate_ipi(cpu_data[me].ipi_pipe[0], + current->thread.mode.tt.extern_pid); + + for(cpu = 1; cpu < ncpus; cpu++){ +@@ -180,10 +185,10 @@ void smp_prepare_cpus(unsigned int maxcp + unhash_process(idle); + + waittime = 200000000; +- while (waittime-- && !cpu_isset(cpu, smp_callin_map)) ++ while (waittime-- && !cpu_isset(cpu, cpu_callin_map)) + cpu_relax(); + +- if (cpu_isset(cpu, smp_callin_map)) ++ if (cpu_isset(cpu, cpu_callin_map)) + printk("done\n"); + else printk("failed\n"); + } +@@ -273,7 +278,7 @@ int smp_call_function(void (*_func)(void + info = _info; + + for (i=0;i<NR_CPUS;i++) +- if((i != current->thread_info->cpu) && ++ if((i != current_thread->cpu) && + cpu_isset(i, cpu_online_map)) + write(cpu_data[i].ipi_pipe[1], "C", 1); + +diff -puN arch/um/kernel/sys_call_table.c~uml-summa.diff arch/um/kernel/sys_call_table.c +--- limbo/arch/um/kernel/sys_call_table.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/sys_call_table.c Tue Oct 21 16:42:38 2003 +@@ -219,15 +219,30 @@ extern syscall_handler_t sys_getdents64; + extern syscall_handler_t sys_gettid; + extern syscall_handler_t sys_readahead; + extern syscall_handler_t 
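The smp.c changes above track the 2.6 cpumask_t API: cpu_set(), cpu_isset() and cpu_test_and_set() take the mask itself rather than a pointer, and empty masks initialize from CPU_MASK_NONE. In miniature (the function and mask are illustrative):

#include "linux/cpumask.h"
#include "linux/kernel.h"

static cpumask_t callin_sketch = CPU_MASK_NONE;

static void mark_cpu_sketch(int cpu)
{
	if(cpu_test_and_set(cpu, callin_sketch))   /* mask, not &mask */
		printk("CPU#%d already present\n", cpu);
	if(cpu_isset(cpu, callin_sketch))
		printk("CPU#%d is up\n", cpu);
}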
sys_tkill; ++extern syscall_handler_t sys_setxattr; ++extern syscall_handler_t sys_lsetxattr; ++extern syscall_handler_t sys_fsetxattr; ++extern syscall_handler_t sys_getxattr; ++extern syscall_handler_t sys_lgetxattr; ++extern syscall_handler_t sys_fgetxattr; ++extern syscall_handler_t sys_listxattr; ++extern syscall_handler_t sys_llistxattr; ++extern syscall_handler_t sys_flistxattr; ++extern syscall_handler_t sys_removexattr; ++extern syscall_handler_t sys_lremovexattr; ++extern syscall_handler_t sys_fremovexattr; + extern syscall_handler_t sys_sendfile64; + extern syscall_handler_t sys_futex; + extern syscall_handler_t sys_sched_setaffinity; + extern syscall_handler_t sys_sched_getaffinity; ++extern syscall_handler_t sys_set_thread_area; ++extern syscall_handler_t sys_get_thread_area; + extern syscall_handler_t sys_io_setup; + extern syscall_handler_t sys_io_destroy; + extern syscall_handler_t sys_io_getevents; + extern syscall_handler_t sys_io_submit; + extern syscall_handler_t sys_io_cancel; ++extern syscall_handler_t sys_fadvise64; + extern syscall_handler_t sys_exit_group; + extern syscall_handler_t sys_lookup_dcookie; + extern syscall_handler_t sys_epoll_create; +@@ -235,6 +250,21 @@ extern syscall_handler_t sys_epoll_ctl; + extern syscall_handler_t sys_epoll_wait; + extern syscall_handler_t sys_remap_file_pages; + extern syscall_handler_t sys_set_tid_address; ++extern syscall_handler_t sys_timer_create; ++extern syscall_handler_t sys_timer_settime; ++extern syscall_handler_t sys_timer_gettime; ++extern syscall_handler_t sys_timer_getoverrun; ++extern syscall_handler_t sys_timer_delete; ++extern syscall_handler_t sys_clock_settime; ++extern syscall_handler_t sys_clock_gettime; ++extern syscall_handler_t sys_clock_getres; ++extern syscall_handler_t sys_clock_nanosleep; ++extern syscall_handler_t sys_statfs64; ++extern syscall_handler_t sys_fstatfs64; ++extern syscall_handler_t sys_tgkill; ++extern syscall_handler_t sys_utimes; ++extern syscall_handler_t sys_fadvise64_64; ++extern syscall_handler_t sys_reiser4; + + #ifdef CONFIG_NFSD + #define NFSSERVCTL sys_nfsservctl +@@ -246,7 +276,7 @@ extern syscall_handler_t um_mount; + extern syscall_handler_t um_time; + extern syscall_handler_t um_stime; + +-#define LAST_GENERIC_SYSCALL __NR_set_tid_address ++#define LAST_GENERIC_SYSCALL __NR_reiser4 + + #if LAST_GENERIC_SYSCALL > LAST_ARCH_SYSCALL + #define LAST_SYSCALL LAST_GENERIC_SYSCALL +@@ -455,32 +485,37 @@ syscall_handler_t *sys_call_table[] = { + [ __NR_stat64 ] = sys_stat64, + [ __NR_lstat64 ] = sys_lstat64, + [ __NR_fstat64 ] = sys_fstat64, +- [ __NR_fcntl64 ] = sys_fcntl64, + [ __NR_getdents64 ] = sys_getdents64, ++ [ __NR_fcntl64 ] = sys_fcntl64, ++ [ 223 ] = sys_ni_syscall, + [ __NR_gettid ] = sys_gettid, + [ __NR_readahead ] = sys_readahead, +- [ __NR_setxattr ] = sys_ni_syscall, +- [ __NR_lsetxattr ] = sys_ni_syscall, +- [ __NR_fsetxattr ] = sys_ni_syscall, +- [ __NR_getxattr ] = sys_ni_syscall, +- [ __NR_lgetxattr ] = sys_ni_syscall, +- [ __NR_fgetxattr ] = sys_ni_syscall, +- [ __NR_listxattr ] = sys_ni_syscall, +- [ __NR_llistxattr ] = sys_ni_syscall, +- [ __NR_flistxattr ] = sys_ni_syscall, +- [ __NR_removexattr ] = sys_ni_syscall, +- [ __NR_lremovexattr ] = sys_ni_syscall, +- [ __NR_fremovexattr ] = sys_ni_syscall, ++ [ __NR_setxattr ] = sys_setxattr, ++ [ __NR_lsetxattr ] = sys_lsetxattr, ++ [ __NR_fsetxattr ] = sys_fsetxattr, ++ [ __NR_getxattr ] = sys_getxattr, ++ [ __NR_lgetxattr ] = sys_lgetxattr, ++ [ __NR_fgetxattr ] = sys_fgetxattr, ++ [ __NR_listxattr ] = 
sys_listxattr, ++ [ __NR_llistxattr ] = sys_llistxattr, ++ [ __NR_flistxattr ] = sys_flistxattr, ++ [ __NR_removexattr ] = sys_removexattr, ++ [ __NR_lremovexattr ] = sys_lremovexattr, ++ [ __NR_fremovexattr ] = sys_fremovexattr, + [ __NR_tkill ] = sys_tkill, + [ __NR_sendfile64 ] = sys_sendfile64, + [ __NR_futex ] = sys_futex, + [ __NR_sched_setaffinity ] = sys_sched_setaffinity, + [ __NR_sched_getaffinity ] = sys_sched_getaffinity, ++ [ __NR_set_thread_area ] = sys_ni_syscall, ++ [ __NR_get_thread_area ] = sys_ni_syscall, + [ __NR_io_setup ] = sys_io_setup, + [ __NR_io_destroy ] = sys_io_destroy, + [ __NR_io_getevents ] = sys_io_getevents, + [ __NR_io_submit ] = sys_io_submit, + [ __NR_io_cancel ] = sys_io_cancel, ++ [ __NR_fadvise64 ] = sys_fadvise64, ++ [ 251 ] = sys_ni_syscall, + [ __NR_exit_group ] = sys_exit_group, + [ __NR_lookup_dcookie ] = sys_lookup_dcookie, + [ __NR_epoll_create ] = sys_epoll_create, +@@ -488,6 +523,25 @@ syscall_handler_t *sys_call_table[] = { + [ __NR_epoll_wait ] = sys_epoll_wait, + [ __NR_remap_file_pages ] = sys_remap_file_pages, + [ __NR_set_tid_address ] = sys_set_tid_address, ++ [ __NR_timer_create ] = sys_timer_create, ++ [ __NR_timer_settime ] = sys_timer_settime, ++ [ __NR_timer_gettime ] = sys_timer_gettime, ++ [ __NR_timer_getoverrun ] = sys_timer_getoverrun, ++ [ __NR_timer_delete ] = sys_timer_delete, ++ [ __NR_clock_settime ] = sys_clock_settime, ++ [ __NR_clock_gettime ] = sys_clock_gettime, ++ [ __NR_clock_getres ] = sys_clock_getres, ++ [ __NR_clock_nanosleep ] = sys_clock_nanosleep, ++ [ __NR_statfs64 ] = sys_statfs64, ++ [ __NR_fstatfs64 ] = sys_fstatfs64, ++ [ __NR_tgkill ] = sys_tgkill, ++ [ __NR_utimes ] = sys_utimes, ++ [ __NR_fadvise64_64 ] = sys_fadvise64_64, ++#ifdef CONFIG_REISER4_FS ++ [ __NR_reiser4 ] = sys_reiser4, ++#else ++ [ __NR_reiser4 ] = sys_ni_syscall, ++#endif + + ARCH_SYSCALLS + [ LAST_SYSCALL + 1 ... NR_syscalls ] = +diff -puN arch/um/kernel/syscall_kern.c~uml-summa.diff arch/um/kernel/syscall_kern.c +--- limbo/arch/um/kernel/syscall_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/syscall_kern.c Tue Oct 21 16:42:38 2003 +@@ -35,39 +35,40 @@ long um_mount(char * dev_name, char * di + + long sys_fork(void) + { +- struct task_struct *p; ++ long ret; + + current->thread.forking = 1; +- p = do_fork(SIGCHLD, 0, NULL, 0, NULL, NULL); ++ ret = do_fork(SIGCHLD, 0, NULL, 0, NULL, NULL); + current->thread.forking = 0; +- return(IS_ERR(p) ? PTR_ERR(p) : p->pid); ++ return(ret); + } + +-long sys_clone(unsigned long clone_flags, unsigned long newsp) ++long sys_clone(unsigned long clone_flags, unsigned long newsp, ++ int *parent_tid, int *child_tid) + { +- struct task_struct *p; ++ long ret; + + current->thread.forking = 1; +- p = do_fork(clone_flags, newsp, NULL, 0, NULL, NULL); ++ ret = do_fork(clone_flags, newsp, NULL, 0, parent_tid, child_tid); + current->thread.forking = 0; +- return(IS_ERR(p) ? PTR_ERR(p) : p->pid); ++ return(ret); + } + + long sys_vfork(void) + { +- struct task_struct *p; ++ long ret; + + current->thread.forking = 1; +- p = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, NULL, 0, NULL, NULL); ++ ret = do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, NULL, 0, NULL, ++ NULL); + current->thread.forking = 0; +- return(IS_ERR(p) ? 
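The table above leans on GCC designated initializers, including the range form, with sys_ni_syscall as the backstop for the unimplemented tail; the explicit [ 223 ] and [ 251 ] entries plug holes that would otherwise be silently NULL rather than ni_syscall. The idiom in miniature, runnable standalone:

#include <stdio.h>

typedef int (*handler_t)(void);

static int real_handler(void) { return(0); }
static int ni_handler(void) { return(-1); }	/* sys_ni_syscall stand-in */

static handler_t table[16] = {
	[ 0 ] = real_handler,
	[ 3 ] = real_handler,
	[ 4 ... 15 ] = ni_handler,	/* GNU range designator */
};

int main(void)
{
	/* slots 1 and 2 were never named : NULL, not ni_handler */
	printf("%d %d\n", table[1] == NULL, table[4] == ni_handler);
	return(0);			/* prints "1 1" */
}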
PTR_ERR(p) : p->pid); ++ return(ret); + } + + /* common code for old and new mmaps */ +-static inline long do_mmap2( +- unsigned long addr, unsigned long len, +- unsigned long prot, unsigned long flags, +- unsigned long fd, unsigned long pgoff) ++long do_mmap2(struct mm_struct *mm, unsigned long addr, unsigned long len, ++ unsigned long prot, unsigned long flags, unsigned long fd, ++ unsigned long pgoff) + { + int error = -EBADF; + struct file * file = NULL; +@@ -79,9 +80,9 @@ static inline long do_mmap2( + goto out; + } + +- down_write(¤t->mm->mmap_sem); +- error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); +- up_write(¤t->mm->mmap_sem); ++ down_write(&mm->mmap_sem); ++ error = do_mmap_pgoff(mm, file, addr, len, prot, flags, pgoff); ++ up_write(&mm->mmap_sem); + + if (file) + fput(file); +@@ -93,7 +94,7 @@ long sys_mmap2(unsigned long addr, unsig + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) + { +- return do_mmap2(addr, len, prot, flags, fd, pgoff); ++ return do_mmap2(current->mm, addr, len, prot, flags, fd, pgoff); + } + + /* +@@ -120,7 +121,8 @@ int old_mmap(unsigned long addr, unsigne + if (offset & ~PAGE_MASK) + goto out; + +- err = do_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); ++ err = do_mmap2(current->mm, addr, len, prot, flags, fd, ++ offset >> PAGE_SHIFT); + out: + return err; + } +@@ -141,37 +143,6 @@ int sys_pipe(unsigned long * fildes) + return error; + } + +-int sys_sigaction(int sig, const struct old_sigaction *act, +- struct old_sigaction *oact) +-{ +- struct k_sigaction new_ka, old_ka; +- int ret; +- +- if (act) { +- old_sigset_t mask; +- if (verify_area(VERIFY_READ, act, sizeof(*act)) || +- __get_user(new_ka.sa.sa_handler, &act->sa_handler) || +- __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) +- return -EFAULT; +- __get_user(new_ka.sa.sa_flags, &act->sa_flags); +- __get_user(mask, &act->sa_mask); +- siginitset(&new_ka.sa.sa_mask, mask); +- } +- +- ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); +- +- if (!ret && oact) { +- if (verify_area(VERIFY_WRITE, oact, sizeof(*oact)) || +- __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || +- __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) +- return -EFAULT; +- __put_user(old_ka.sa.sa_flags, &oact->sa_flags); +- __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); +- } +- +- return ret; +-} +- + /* + * sys_ipc() is the de-multiplexer for the SysV IPC calls.. + * +@@ -253,7 +224,7 @@ int sys_ipc (uint call, int first, int s + return sys_shmctl (first, second, + (struct shmid_ds *) ptr); + default: +- return -EINVAL; ++ return -ENOSYS; + } + } + +@@ -302,11 +273,6 @@ int sys_olduname(struct oldold_utsname * + return error; + } + +-int sys_sigaltstack(const stack_t *uss, stack_t *uoss) +-{ +- return(do_sigaltstack(uss, uoss, PT_REGS_SP(¤t->thread.regs))); +-} +- + long execute_syscall(void *r) + { + return(CHOOSE_MODE_PROC(execute_syscall_tt, execute_syscall_skas, r)); +diff -puN arch/um/kernel/sysrq.c~uml-summa.diff arch/um/kernel/sysrq.c +--- limbo/arch/um/kernel/sysrq.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/sysrq.c Tue Oct 21 16:42:38 2003 +@@ -55,6 +55,14 @@ void show_trace_task(struct task_struct + show_trace((unsigned long *)esp); + } + ++void show_stack(struct task_struct *task, unsigned long *sp) ++{ ++ if(task) ++ show_trace_task(task); ++ else ++ show_trace(sp); ++} ++ + /* + * Overrides for Emacs so that we follow Linus's tabbing style. 
+ * Emacs will notice this stuff at the end of the file and automatically +diff -puN arch/um/kernel/time.c~uml-summa.diff arch/um/kernel/time.c +--- limbo/arch/um/kernel/time.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/time.c Tue Oct 21 16:42:38 2003 +@@ -16,12 +16,16 @@ + #include "process.h" + #include "signal_user.h" + #include "time_user.h" ++#include "kern_constants.h" + + extern struct timeval xtime; + ++struct timeval local_offset = { 0, 0 }; ++ + void timer(void) + { + gettimeofday(&xtime, NULL); ++ timeradd(&xtime, &local_offset, &xtime); + } + + void set_interval(int timer_type) +@@ -66,7 +70,7 @@ void switch_timers(int to_real) + errno); + } + +-void idle_timer(void) ++void uml_idle_timer(void) + { + if(signal(SIGVTALRM, SIG_IGN) == SIG_ERR) + panic("Couldn't unset SIGVTALRM handler"); +@@ -83,8 +87,6 @@ void time_init(void) + set_interval(ITIMER_VIRTUAL); + } + +-struct timeval local_offset = { 0, 0 }; +- + void do_gettimeofday(struct timeval *tv) + { + unsigned long flags; +@@ -103,7 +105,7 @@ int do_settimeofday(struct timespec *tv) + unsigned long flags; + struct timeval tv_in; + +- if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) ++ if ((unsigned long) tv->tv_nsec >= UM_NSEC_PER_SEC) + return -EINVAL; + + tv_in.tv_sec = tv->tv_sec; +@@ -113,6 +115,8 @@ int do_settimeofday(struct timespec *tv) + gettimeofday(&now, NULL); + timersub(&tv_in, &now, &local_offset); + time_unlock(flags); ++ ++ return(0); + } + + EXPORT_SYMBOL(do_settimeofday); +diff -puN arch/um/kernel/time_kern.c~uml-summa.diff arch/um/kernel/time_kern.c +--- limbo/arch/um/kernel/time_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/time_kern.c Tue Oct 21 16:42:38 2003 +@@ -41,7 +41,7 @@ int __attribute__ ((__section__ (".unpro + + void timer_irq(union uml_pt_regs *regs) + { +- int cpu = current->thread_info->cpu, ticks = missed_ticks[cpu]; ++ int cpu = current_thread->cpu, ticks = missed_ticks[cpu]; + + if(!timer_irq_inited) return; + missed_ticks[cpu] = 0; +@@ -58,12 +58,13 @@ void boot_timer_handler(int sig) + do_timer(®s); + } + +-void um_timer(int irq, void *dev, struct pt_regs *regs) ++irqreturn_t um_timer(int irq, void *dev, struct pt_regs *regs) + { + do_timer(regs); +- write_seqlock(&xtime_lock); ++ write_seqlock_irq(&xtime_lock); + timer(); +- write_sequnlock(&xtime_lock); ++ write_sequnlock_irq(&xtime_lock); ++ return(IRQ_HANDLED); + } + + long um_time(int * tloc) +@@ -81,12 +82,12 @@ long um_time(int * tloc) + long um_stime(int * tptr) + { + int value; +- struct timeval new; ++ struct timespec new; + + if (get_user(value, tptr)) + return -EFAULT; + new.tv_sec = value; +- new.tv_usec = 0; ++ new.tv_nsec = 0; + do_settimeofday(&new); + return 0; + } +@@ -125,9 +126,11 @@ void __const_udelay(um_udelay_t usecs) + void timer_handler(int sig, union uml_pt_regs *regs) + { + #ifdef CONFIG_SMP ++ local_irq_disable(); + update_process_times(user_context(UPT_SP(regs))); ++ local_irq_enable(); + #endif +- if(current->thread_info->cpu == 0) ++ if(current_thread->cpu == 0) + timer_irq(regs); + } + +diff -puN arch/um/kernel/trap_kern.c~uml-summa.diff arch/um/kernel/trap_kern.c +--- limbo/arch/um/kernel/trap_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/trap_kern.c Tue Oct 21 16:42:38 2003 +@@ -16,6 +16,7 @@ + #include "asm/tlbflush.h" + #include "asm/a.out.h" + #include "asm/current.h" ++#include "asm/irq.h" + #include "user_util.h" + #include "kern_util.h" + #include "kern.h" +@@ -51,7 +52,7 @@ int handle_page_fault(unsigned long 
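time.c above implements settimeofday for UML as a pure offset on the host clock: do_settimeofday() stores requested minus host-now in local_offset, and each tick's timer() reports host-now plus local_offset. The arithmetic in isolation, using the same timeradd()/timersub() macros from <sys/time.h> (locking omitted):

#include <stdio.h>
#include <sys/time.h>

static struct timeval local_offset = { 0, 0 };

static void settime_sketch(struct timeval *wanted)   /* do_settimeofday */
{
	struct timeval now;

	gettimeofday(&now, NULL);
	timersub(wanted, &now, &local_offset);	/* offset = wanted - now */
}

static void gettime_sketch(struct timeval *out)       /* timer() */
{
	gettimeofday(out, NULL);
	timeradd(out, &local_offset, out);	/* host time + offset */
}

int main(void)
{
	struct timeval wanted = { 1000000000, 0 }, tv;

	settime_sketch(&wanted);
	gettime_sketch(&tv);
	printf("%ld\n", (long) tv.tv_sec);	/* ~1000000000 */
	return(0);
}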
addr + if(is_write && !(vma->vm_flags & VM_WRITE)) + goto out; + page = address & PAGE_MASK; +- if(page == (unsigned long) current->thread_info + PAGE_SIZE) ++ if(page == (unsigned long) current_thread + PAGE_SIZE) + panic("Kernel stack overflow"); + pgd = pgd_offset(mm, page); + pmd = pmd_offset(pgd, page); +@@ -180,6 +181,11 @@ void bus_handler(int sig, union uml_pt_r + else relay_signal(sig, regs); + } + ++void winch(int sig, union uml_pt_regs *regs) ++{ ++ do_IRQ(WINCH_IRQ, regs); ++} ++ + void trap_init(void) + { + } +diff -puN arch/um/kernel/trap_user.c~uml-summa.diff arch/um/kernel/trap_user.c +--- limbo/arch/um/kernel/trap_user.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/trap_user.c Tue Oct 21 16:42:38 2003 +@@ -82,6 +82,8 @@ struct signal_info sig_info[] = { + .is_irq = 0 }, + [ SIGILL ] { .handler = relay_signal, + .is_irq = 0 }, ++ [ SIGWINCH ] { .handler = winch, ++ .is_irq = 1 }, + [ SIGBUS ] { .handler = bus_handler, + .is_irq = 0 }, + [ SIGSEGV] { .handler = segv_handler, +@@ -121,9 +123,9 @@ void alarm_handler(int sig, struct sigco + + void do_longjmp(void *b, int val) + { +- jmp_buf *buf = b; ++ sigjmp_buf *buf = b; + +- longjmp(*buf, val); ++ siglongjmp(*buf, val); + } + + /* +diff -puN arch/um/kernel/tt/exec_kern.c~uml-summa.diff arch/um/kernel/tt/exec_kern.c +--- limbo/arch/um/kernel/tt/exec_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/tt/exec_kern.c Tue Oct 21 16:42:38 2003 +@@ -47,17 +47,17 @@ void flush_thread_tt(void) + do_exit(SIGKILL); + } + +- if(current->thread_info->cpu == 0) ++ if(current_thread->cpu == 0) + forward_interrupts(new_pid); + current->thread.request.op = OP_EXEC; + current->thread.request.u.exec.pid = new_pid; +- unprotect_stack((unsigned long) current->thread_info); ++ unprotect_stack((unsigned long) current_thread); + os_usr1_process(os_getpid()); + + enable_timer(); + free_page(stack); + protect_memory(uml_reserved, high_physmem - uml_reserved, 1, 1, 0, 1); +- task_protections((unsigned long) current->thread_info); ++ task_protections((unsigned long) current_thread); + force_flush_all(); + unblock_signals(); + } +diff -puN arch/um/kernel/tt/include/uaccess.h~uml-summa.diff arch/um/kernel/tt/include/uaccess.h +--- limbo/arch/um/kernel/tt/include/uaccess.h~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/tt/include/uaccess.h Tue Oct 21 16:42:38 2003 +@@ -46,18 +46,20 @@ extern int __do_copy_from_user(void *to, + + static inline int copy_from_user_tt(void *to, const void *from, int n) + { +- return(access_ok_tt(VERIFY_READ, from, n) ? +- __do_copy_from_user(to, from, n, +- ¤t->thread.fault_addr, +- ¤t->thread.fault_catcher) : n); ++ if(!access_ok_tt(VERIFY_READ, from, n)) ++ return(n); ++ ++ return(__do_copy_from_user(to, from, n, ¤t->thread.fault_addr, ++ ¤t->thread.fault_catcher)); + } + + static inline int copy_to_user_tt(void *to, const void *from, int n) + { +- return(access_ok_tt(VERIFY_WRITE, to, n) ? 
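With the sig_info entry above, a host SIGWINCH becomes an ordinary UML interrupt source: is_irq gets it the IRQ entry/exit bookkeeping, and winch() feeds it into do_IRQ(WINCH_IRQ), where whatever handler was registered through register_winch_irq()/um_request_irq() runs. A consumer-side sketch (loosely modeled on the console code; the body is made up):

#include "linux/tty.h"
#include "linux/interrupt.h"

extern void example_update_winsize(struct tty_struct *tty);  /* hypothetical */

static irqreturn_t winch_interrupt_sketch(int irq, void *data,
					  struct pt_regs *unused)
{
	struct tty_struct *tty = data;

	/* re-read the host window size and push it to the tty */
	example_update_winsize(tty);
	return(IRQ_HANDLED);
}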
+- __do_copy_to_user(to, from, n, +- ¤t->thread.fault_addr, +- ¤t->thread.fault_catcher) : n); ++ if(!access_ok_tt(VERIFY_WRITE, to, n)) ++ return(n); ++ ++ return(__do_copy_to_user(to, from, n, ¤t->thread.fault_addr, ++ ¤t->thread.fault_catcher)); + } + + extern int __do_strncpy_from_user(char *dst, const char *src, size_t n, +@@ -67,7 +69,9 @@ static inline int strncpy_from_user_tt(c + { + int n; + +- if(!access_ok_tt(VERIFY_READ, src, 1)) return(-EFAULT); ++ if(!access_ok_tt(VERIFY_READ, src, 1)) ++ return(-EFAULT); ++ + n = __do_strncpy_from_user(dst, src, count, + ¤t->thread.fault_addr, + ¤t->thread.fault_catcher); +@@ -87,10 +91,11 @@ static inline int __clear_user_tt(void * + + static inline int clear_user_tt(void *mem, int len) + { +- return(access_ok_tt(VERIFY_WRITE, mem, len) ? +- __do_clear_user(mem, len, +- ¤t->thread.fault_addr, +- ¤t->thread.fault_catcher) : len); ++ if(!access_ok_tt(VERIFY_WRITE, mem, len)) ++ return(len); ++ ++ return(__do_clear_user(mem, len, ¤t->thread.fault_addr, ++ ¤t->thread.fault_catcher)); + } + + extern int __do_strnlen_user(const char *str, unsigned long n, +diff -puN arch/um/kernel/tt/process_kern.c~uml-summa.diff arch/um/kernel/tt/process_kern.c +--- limbo/arch/um/kernel/tt/process_kern.c~uml-summa.diff Tue Oct 21 16:42:37 2003 ++++ limbo-god/arch/um/kernel/tt/process_kern.c Tue Oct 21 16:42:38 2003 +@@ -104,7 +104,10 @@ void *switch_to_tt(void *prev, void *nex + + void release_thread_tt(struct task_struct *task) + { +- os_kill_process(task->thread.mode.tt.extern_pid, 0); ++ int pid = task->thread.mode.tt.extern_pid; ++ ++ if(os_getpid() != pid) ++ os_kill_process(pid, 0); + } + + void exit_thread_tt(void) +@@ -125,27 +128,27 @@ static void new_thread_handler(int sig) + UPT_SC(¤t->thread.regs.regs) = (void *) (&sig + 1); + suspend_new_thread(current->thread.mode.tt.switch_pipe[0]); + +- block_signals(); ++ force_flush_all(); ++ if(current->thread.prev_sched != NULL) ++ schedule_tail(current->thread.prev_sched); ++ current->thread.prev_sched = NULL; ++ + init_new_thread_signals(1); +-#ifdef CONFIG_SMP +- schedule_tail(current->thread.prev_sched); +-#endif + enable_timer(); + free_page(current->thread.temp_stack); + set_cmdline("(kernel thread)"); +- force_flush_all(); + +- current->thread.prev_sched = NULL; + change_sig(SIGUSR1, 1); + change_sig(SIGVTALRM, 1); + change_sig(SIGPROF, 1); +- unblock_signals(); ++ local_irq_enable(); + if(!run_kernel_thread(fn, arg, ¤t->thread.exec_buf)) + do_exit(0); + } + + static int new_thread_proc(void *stack) + { ++ local_irq_disable(); + init_new_thread_stack(stack, new_thread_handler); + os_usr1_process(os_getpid()); + return(0); +@@ -165,35 +168,32 @@ void finish_fork_handler(int sig) + UPT_SC(¤t->thread.regs.regs) = (void *) (&sig + 1); + suspend_new_thread(current->thread.mode.tt.switch_pipe[0]); + +-#ifdef CONFIG_SMP +- schedule_tail(NULL); +-#endif ++ force_flush_all(); ++ if(current->thread.prev_sched != NULL) ++ schedule_tail(current->thread.prev_sched); ++ current->thread.prev_sched = NULL; ++ + enable_timer(); + change_sig(SIGVTALRM, 1); + local_irq_enable(); +- force_flush_all(); + if(current->mm != current->parent->mm) + protect_memory(uml_reserved, high_physmem - uml_reserved, 1, + 1, 0, 1); +- task_protections((unsigned long) current->thread_info); +- +- current->thread.prev_sched = NULL; ++ task_protections((unsigned long) current_thread); + + free_page(current->thread.temp_stack); ++ local_irq_disable(); + change_sig(SIGUSR1, 0); + set_user_mode(current); + } + +-static int sigusr1 = SIGUSR1; +- 
+ int fork_tramp(void *stack) + { +- int sig = sigusr1; +- + local_irq_disable(); ++ arch_init_thread(); + init_new_thread_stack(stack, finish_fork_handler); + +- kill(os_getpid(), sig); ++ os_usr1_process(os_getpid()); + return(0); + } + +@@ -377,8 +377,8 @@ static void mprotect_kernel_mem(int w) + + pages = (1 << CONFIG_KERNEL_STACK_ORDER); + +- start = (unsigned long) current->thread_info + PAGE_SIZE; +- end = (unsigned long) current + PAGE_SIZE * pages; ++ start = (unsigned long) current_thread + PAGE_SIZE; ++ end = (unsigned long) current_thread + PAGE_SIZE * pages; + protect_memory(uml_reserved, start - uml_reserved, 1, w, 1, 1); + protect_memory(end, high_physmem - end, 1, w, 1, 1); + +diff -puN arch/um/kernel/tt/ptproxy/proxy.c~uml-summa.diff arch/um/kernel/tt/ptproxy/proxy.c +--- limbo/arch/um/kernel/tt/ptproxy/proxy.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/kernel/tt/ptproxy/proxy.c Tue Oct 21 16:42:38 2003 +@@ -293,11 +293,10 @@ void fake_child_exit(void) + } + + char gdb_init_string[] = +-"att 1 +-b panic +-b stop +-handle SIGWINCH nostop noprint pass +-"; ++"att 1\n" ++"b panic\n" ++"b stop\n" ++"handle SIGWINCH nostop noprint pass\n"; + + int start_debugger(char *prog, int startup, int stop, int *fd_out) + { +diff -puN arch/um/kernel/tt/tlb.c~uml-summa.diff arch/um/kernel/tt/tlb.c +--- limbo/arch/um/kernel/tt/tlb.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/kernel/tt/tlb.c Tue Oct 21 16:42:38 2003 +@@ -10,6 +10,7 @@ + #include "asm/page.h" + #include "asm/pgtable.h" + #include "asm/uaccess.h" ++#include "asm/tlbflush.h" + #include "user_util.h" + #include "mem_user.h" + #include "os.h" +diff -puN arch/um/kernel/tt/tracer.c~uml-summa.diff arch/um/kernel/tt/tracer.c +--- limbo/arch/um/kernel/tt/tracer.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/kernel/tt/tracer.c Tue Oct 21 16:42:38 2003 +@@ -39,7 +39,7 @@ int is_tracer_winch(int pid, int fd, voi + return(0); + + register_winch_irq(tracer_winch[0], fd, -1, data); +- return(0); ++ return(1); + } + + static void tracer_winch_handler(int sig) +@@ -401,7 +401,7 @@ static int __init uml_debug_setup(char * + + if(!strcmp(line, "go")) debug_stop = 0; + else if(!strcmp(line, "parent")) debug_parent = 1; +- else printk("Unknown debug option : '%s'\n", line); ++ else printf("Unknown debug option : '%s'\n", line); + + line = next; + } +diff -puN arch/um/kernel/tt/uaccess_user.c~uml-summa.diff arch/um/kernel/tt/uaccess_user.c +--- limbo/arch/um/kernel/tt/uaccess_user.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/kernel/tt/uaccess_user.c Tue Oct 21 16:42:38 2003 +@@ -60,10 +60,10 @@ int __do_strnlen_user(const char *str, u + { + int ret; + unsigned long *faddrp = (unsigned long *)fault_addr; +- jmp_buf jbuf; ++ sigjmp_buf jbuf; + + *fault_catcher = &jbuf; +- if(setjmp(jbuf) == 0){ ++ if(sigsetjmp(jbuf,1) == 0){ + ret = strlen(str) + 1; + } + else { +diff -puN arch/um/kernel/tty_log.c~uml-summa.diff arch/um/kernel/tty_log.c +--- limbo/arch/um/kernel/tty_log.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/kernel/tty_log.c Tue Oct 21 16:42:38 2003 +@@ -13,6 +13,7 @@ + #include <sys/time.h> + #include "init.h" + #include "user.h" ++#include "kern_util.h" + #include "os.h" + + #define TTY_LOG_DIR "./" +@@ -24,29 +25,40 @@ static int tty_log_fd = -1; + #define TTY_LOG_OPEN 1 + #define TTY_LOG_CLOSE 2 + #define TTY_LOG_WRITE 3 ++#define TTY_LOG_EXEC 4 ++ ++#define TTY_READ 1 ++#define TTY_WRITE 2 + + struct tty_log_buf { + int what; + unsigned long 
tty;
+ int len;
++ int direction;
++ unsigned long sec;
++ unsigned long usec;
+ };
+
+-int open_tty_log(void *tty)
++int open_tty_log(void *tty, void *current_tty)
+ {
+ struct timeval tv;
+ struct tty_log_buf data;
+ char buf[strlen(tty_log_dir) + sizeof("01234567890-01234567\0")];
+ int fd;
+
++ gettimeofday(&tv, NULL);
+ if(tty_log_fd != -1){
+- data = ((struct tty_log_buf) { what : TTY_LOG_OPEN,
+- tty : (unsigned long) tty,
+- len : 0 });
++ data = ((struct tty_log_buf) { .what = TTY_LOG_OPEN,
++ .tty = (unsigned long) tty,
++ .len = sizeof(current_tty),
++ .direction = 0,
++ .sec = tv.tv_sec,
++ .usec = tv.tv_usec } );
+ write(tty_log_fd, &data, sizeof(data));
++ write(tty_log_fd, &current_tty, data.len);
+ return(tty_log_fd);
+ }
+
+- gettimeofday(&tv, NULL);
+ sprintf(buf, "%s/%0u-%0u", tty_log_dir, (unsigned int) tv.tv_sec,
+ (unsigned int) tv.tv_usec);
+
+@@ -62,30 +74,114 @@ int open_tty_log(void *tty)
+ void close_tty_log(int fd, void *tty)
+ {
+ struct tty_log_buf data;
++ struct timeval tv;
+
+ if(tty_log_fd != -1){
++ gettimeofday(&tv, NULL);
+- data = ((struct tty_log_buf) { what : TTY_LOG_CLOSE,
+- tty : (unsigned long) tty,
+- len : 0 });
++ data = ((struct tty_log_buf) { .what = TTY_LOG_CLOSE,
++ .tty = (unsigned long) tty,
++ .len = 0,
++ .direction = 0,
++ .sec = tv.tv_sec,
++ .usec = tv.tv_usec } );
+ write(tty_log_fd, &data, sizeof(data));
+ return;
+ }
+ close(fd);
+ }
+
+-int write_tty_log(int fd, char *buf, int len, void *tty)
++static int log_chunk(int fd, const char *buf, int len)
+ {
++ int total = 0, try, missed, n;
++ char chunk[64];
++
++ while(len > 0){
++ try = (len > sizeof(chunk)) ? sizeof(chunk) : len;
++ missed = copy_from_user_proc(chunk, (char *) buf, try);
++ try -= missed;
++ n = write(fd, chunk, try);
++ if(n != try)
++ return(-errno);
++ if(missed != 0)
++ return(-EFAULT);
++
++ len -= try;
++ total += try;
++ buf += try;
++ }
++
++ return(total);
++}
++
++int write_tty_log(int fd, const char *buf, int len, void *tty, int is_read)
++{
++ struct timeval tv;
+ struct tty_log_buf data;
++ int direction;
+
+ if(fd == tty_log_fd){
+- data = ((struct tty_log_buf) { what : TTY_LOG_WRITE,
+- tty : (unsigned long) tty,
+- len : len });
++ gettimeofday(&tv, NULL);
++ direction = is_read ?
TTY_READ : TTY_WRITE; ++ data = ((struct tty_log_buf) { .what = TTY_LOG_WRITE, ++ .tty = (unsigned long) tty, ++ .len = len, ++ .direction = direction, ++ .sec = tv.tv_sec, ++ .usec = tv.tv_usec } ); + write(tty_log_fd, &data, sizeof(data)); + } +- return(write(fd, buf, len)); ++ ++ return(log_chunk(fd, buf, len)); + } + ++void log_exec(char **argv, void *tty) ++{ ++ struct timeval tv; ++ struct tty_log_buf data; ++ char **ptr,*arg; ++ int len; ++ ++ if(tty_log_fd == -1) return; ++ ++ gettimeofday(&tv, NULL); ++ ++ len = 0; ++ for(ptr = argv; ; ptr++){ ++ if(copy_from_user_proc(&arg, ptr, sizeof(arg))) ++ return; ++ if(arg == NULL) break; ++ len += strlen_user_proc(arg); ++ } ++ ++ data = ((struct tty_log_buf) { .what = TTY_LOG_EXEC, ++ .tty = (unsigned long) tty, ++ .len = len, ++ .direction = 0, ++ .sec = tv.tv_sec, ++ .usec = tv.tv_usec } ); ++ write(tty_log_fd, &data, sizeof(data)); ++ ++ for(ptr = argv; ; ptr++){ ++ if(copy_from_user_proc(&arg, ptr, sizeof(arg))) ++ return; ++ if(arg == NULL) break; ++ log_chunk(tty_log_fd, arg, strlen_user_proc(arg)); ++ } ++} ++ ++extern void register_tty_logger(int (*opener)(void *, void *), ++ int (*writer)(int, const char *, int, ++ void *, int), ++ void (*closer)(int, void *)); ++ ++static int register_logger(void) ++{ ++ register_tty_logger(open_tty_log, write_tty_log, close_tty_log); ++ return(0); ++} ++ ++__uml_initcall(register_logger); ++ + static int __init set_tty_log_dir(char *name, int *add) + { + tty_log_dir = name; +@@ -104,7 +200,7 @@ static int __init set_tty_log_fd(char *n + + tty_log_fd = strtoul(name, &end, 0); + if((*end != '\0') || (end == name)){ +- printk("set_tty_log_fd - strtoul failed on '%s'\n", name); ++ printf("set_tty_log_fd - strtoul failed on '%s'\n", name); + tty_log_fd = -1; + } + return 0; +diff -puN arch/um/kernel/uaccess_user.c~uml-summa.diff arch/um/kernel/uaccess_user.c +--- limbo/arch/um/kernel/uaccess_user.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/kernel/uaccess_user.c Tue Oct 21 16:42:38 2003 +@@ -18,9 +18,9 @@ unsigned long __do_user_copy(void *to, c + { + unsigned long *faddrp = (unsigned long *) fault_addr, ret; + +- jmp_buf jbuf; ++ sigjmp_buf jbuf; + *fault_catcher = &jbuf; +- if(setjmp(jbuf) == 0){ ++ if(sigsetjmp(jbuf,1) == 0){ + (*op)(to, from, n); + ret = 0; + *faulted_out = 0; +diff -puN arch/um/kernel/um_arch.c~uml-summa.diff arch/um/kernel/um_arch.c +--- limbo/arch/um/kernel/um_arch.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/kernel/um_arch.c Tue Oct 21 16:42:38 2003 +@@ -38,13 +38,18 @@ + #include "mode_kern.h" + #include "mode.h" + +-#define DEFAULT_COMMAND_LINE "root=6200" ++#define DEFAULT_COMMAND_LINE "root=ubd0" + + struct cpuinfo_um boot_cpu_data = { + .loops_per_jiffy = 0, + .ipi_pipe = { -1, -1 } + }; + ++/* Placeholder to make UML link until the vsyscall stuff is actually ++ * implemented ++ */ ++void *__kernel_vsyscall; ++ + unsigned long thread_saved_pc(struct task_struct *task) + { + return(os_process_pc(CHOOSE_MODE_PROC(thread_pid_tt, thread_pid_skas, +@@ -61,10 +66,14 @@ static int show_cpuinfo(struct seq_file + return 0; + #endif + +- seq_printf(m, "bogomips\t: %lu.%02lu\n", ++ seq_printf(m, "processor\t: %d\n", index); ++ seq_printf(m, "vendor_id\t: User Mode Linux\n"); ++ seq_printf(m, "model name\t: UML\n"); ++ seq_printf(m, "mode\t\t: %s\n", CHOOSE_MODE("tt", "skas")); ++ seq_printf(m, "host\t\t: %s\n", host_info); ++ seq_printf(m, "bogomips\t: %lu.%02lu\n\n", + loops_per_jiffy/(500000/HZ), + (loops_per_jiffy/(5000/HZ)) % 100); +- 
seq_printf(m, "host\t\t: %s\n", host_info);
+
+ return(0);
+ }
+@@ -134,12 +143,12 @@ void set_cmdline(char *cmd)
+ if(umid != NULL){
+ snprintf(argv1_begin,
+ (argv1_end - argv1_begin) * sizeof(*ptr),
+- "(%s)", umid);
++ "(%s) ", umid);
+ ptr = &argv1_begin[strlen(argv1_begin)];
+ }
+ else ptr = argv1_begin;
+
+- snprintf(ptr, (argv1_end - ptr) * sizeof(*ptr), " [%s]", cmd);
++ snprintf(ptr, (argv1_end - ptr) * sizeof(*ptr), "[%s]", cmd);
+ memset(argv1_begin + strlen(argv1_begin), '\0',
+ argv1_end - argv1_begin - strlen(argv1_begin));
+ #endif
+@@ -179,7 +188,7 @@ __uml_setup("root=", uml_root_setup,
+ static int __init uml_ncpus_setup(char *line, int *add)
+ {
+ if (!sscanf(line, "%d", &ncpus)) {
+- printk("Couldn't parse [%s]\n", line);
++ printf("Couldn't parse [%s]\n", line);
+ return -1;
+ }
+
+@@ -210,7 +219,7 @@ static int __init mode_tt_setup(char *li
+
+ static int __init mode_tt_setup(char *line, int *add)
+ {
+- printk("CONFIG_MODE_TT disabled - 'mode=tt' ignored\n");
++ printf("CONFIG_MODE_TT disabled - 'mode=tt' ignored\n");
+ return(0);
+ }
+
+@@ -221,7 +230,7 @@ static int __init mode_tt_setup(char *li
+
+ static int __init mode_tt_setup(char *line, int *add)
+ {
+- printk("CONFIG_MODE_SKAS disabled - 'mode=tt' redundant\n");
++ printf("CONFIG_MODE_SKAS disabled - 'mode=tt' redundant\n");
+ return(0);
+ }
+
+@@ -369,6 +378,7 @@ int linux_main(int argc, char **argv)
+ 2 * PAGE_SIZE;
+
+ task_protections((unsigned long) &init_thread_info);
++ os_flush_stdout();
+
+ return(CHOOSE_MODE(start_uml_tt(), start_uml_skas()));
+ }
+diff -puN arch/um/kernel/umid.c~uml-summa.diff arch/um/kernel/umid.c
+--- limbo/arch/um/kernel/umid.c~uml-summa.diff Tue Oct 21 16:42:38 2003
++++ limbo-god/arch/um/kernel/umid.c Tue Oct 21 16:42:38 2003
+@@ -33,18 +33,19 @@ static char *uml_dir = UML_DIR;
+ static int umid_is_random = 1;
+ static int umid_inited = 0;
+
+-static int make_umid(void);
++static int make_umid(int (*printer)(const char *fmt, ...));
+
+-static int __init set_umid(char *name, int is_random)
++static int __init set_umid(char *name, int is_random,
++ int (*printer)(const char *fmt, ...))
+ {
+ if(umid_inited){
+- printk("Unique machine name can't be set twice\n");
++ (*printer)("Unique machine name can't be set twice\n");
+ return(-1);
+ }
+
+ if(strlen(name) > UMID_LEN - 1)
+- printk("Unique machine name is being truncated to %s "
+- "characters\n", UMID_LEN);
++ (*printer)("Unique machine name is being truncated to %d "
++ "characters\n", UMID_LEN);
+ strlcpy(umid, name, sizeof(umid));
+
+ umid_is_random = is_random;
+@@ -54,7 +55,7 @@ static int __init set_umid(char *name, i
+
+ static int __init set_umid_arg(char *name, int *add)
+ {
+- return(set_umid(name, 0));
++ return(set_umid(name, 0, printf));
+ }
+
+ __uml_setup("umid=", set_umid_arg,
+@@ -67,7 +68,7 @@ int __init umid_file_name(char *name, ch
+ {
+ int n;
+
+- if(!umid_inited && make_umid()) return(-1);
++ if(!umid_inited && make_umid(printk)) return(-1);
+
+ n = strlen(uml_dir) + strlen(umid) + strlen(name) + 1;
+ if(n > len){
+@@ -92,14 +93,14 @@ static int __init create_pid_file(void)
+ fd = os_open_file(file, of_create(of_excl(of_rdwr(OPENFLAGS()))),
+ 0644);
+ if(fd < 0){
+- printk("Open of machine pid file \"%s\" failed - "
++ printf("Open of machine pid file \"%s\" failed - "
+ "errno = %d\n", file, -fd);
+ return 0;
+ }
+
+ sprintf(pid, "%d\n", os_getpid());
+ if(write(fd, pid, strlen(pid)) != strlen(pid))
+- printk("Write of pid file failed - errno = %d\n", errno);
++ printf("Write of pid file failed -
errno = %d\n", errno); + close(fd); + return 0; + } +@@ -197,7 +198,7 @@ static int __init set_uml_dir(char *name + if((strlen(name) > 0) && (name[strlen(name) - 1] != '/')){ + uml_dir = malloc(strlen(name) + 1); + if(uml_dir == NULL){ +- printk("Failed to malloc uml_dir - error = %d\n", ++ printf("Failed to malloc uml_dir - error = %d\n", + errno); + uml_dir = name; + return(0); +@@ -217,7 +218,7 @@ static int __init make_uml_dir(void) + char *home = getenv("HOME"); + + if(home == NULL){ +- printk("make_uml_dir : no value in environment for " ++ printf("make_uml_dir : no value in environment for " + "$HOME\n"); + exit(1); + } +@@ -239,25 +240,25 @@ static int __init make_uml_dir(void) + strcpy(uml_dir, dir); + + if((mkdir(uml_dir, 0777) < 0) && (errno != EEXIST)){ +- printk("Failed to mkdir %s - errno = %i\n", uml_dir, errno); ++ printf("Failed to mkdir %s - errno = %i\n", uml_dir, errno); + return(-1); + } + return 0; + } + +-static int __init make_umid(void) ++static int __init make_umid(int (*printer)(const char *fmt, ...)) + { + int fd, err; + char tmp[strlen(uml_dir) + UMID_LEN + 1]; + + strlcpy(tmp, uml_dir, sizeof(tmp)); + +- if(*umid == 0){ ++ if(!umid_inited){ + strcat(tmp, "XXXXXX"); + fd = mkstemp(tmp); + if(fd < 0){ +- printk("make_umid - mkstemp failed, errno = %d\n", +- errno); ++ (*printer)("make_umid - mkstemp failed, errno = %d\n", ++ errno); + return(1); + } + +@@ -267,7 +268,7 @@ static int __init make_umid(void) + * for directories. + */ + unlink(tmp); +- set_umid(&tmp[strlen(uml_dir)], 1); ++ set_umid(&tmp[strlen(uml_dir)], 1, printer); + } + + sprintf(tmp, "%s%s", uml_dir, umid); +@@ -275,14 +276,14 @@ static int __init make_umid(void) + if((err = mkdir(tmp, 0777)) < 0){ + if(errno == EEXIST){ + if(not_dead_yet(tmp)){ +- printk("umid '%s' is in use\n", umid); ++ (*printer)("umid '%s' is in use\n", umid); + return(-1); + } + err = mkdir(tmp, 0777); + } + } + if(err < 0){ +- printk("Failed to create %s - errno = %d\n", umid, errno); ++ (*printer)("Failed to create %s - errno = %d\n", umid, errno); + return(-1); + } + +@@ -295,7 +296,13 @@ __uml_setup("uml_dir=", set_uml_dir, + ); + + __uml_postsetup(make_uml_dir); +-__uml_postsetup(make_umid); ++ ++static int __init make_umid_setup(void) ++{ ++ return(make_umid(printf)); ++} ++ ++__uml_postsetup(make_umid_setup); + __uml_postsetup(create_pid_file); + + /* +diff -puN arch/um/kernel/user_util.c~uml-summa.diff arch/um/kernel/user_util.c +--- limbo/arch/um/kernel/user_util.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/kernel/user_util.c Tue Oct 21 16:42:38 2003 +@@ -119,17 +119,6 @@ int wait_for_stop(int pid, int sig, int + } + } + +-int clone_and_wait(int (*fn)(void *), void *arg, void *sp, int flags) +-{ +- int pid; +- +- pid = clone(fn, sp, flags, arg); +- if(pid < 0) return(-1); +- wait_for_stop(pid, SIGSTOP, PTRACE_CONT, NULL); +- ptrace(PTRACE_CONT, pid, 0, 0); +- return(pid); +-} +- + int raw(int fd, int complain) + { + struct termios tt; +diff -puN arch/um/os-Linux/drivers/tuntap_user.c~uml-summa.diff arch/um/os-Linux/drivers/tuntap_user.c +--- limbo/arch/um/os-Linux/drivers/tuntap_user.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/os-Linux/drivers/tuntap_user.c Tue Oct 21 16:42:38 2003 +@@ -142,7 +142,7 @@ static int tuntap_open(void *data) + return(-errno); + } + memset(&ifr, 0, sizeof(ifr)); +- ifr.ifr_flags = IFF_TAP; ++ ifr.ifr_flags = IFF_TAP | IFF_NO_PI; + strlcpy(ifr.ifr_name, pri->dev_name, sizeof(ifr.ifr_name)); + if(ioctl(pri->fd, TUNSETIFF, (void *) &ifr) < 0){ + 
printk("TUNSETIFF failed, errno = %d", errno); +diff -puN arch/um/os-Linux/file.c~uml-summa.diff arch/um/os-Linux/file.c +--- limbo/arch/um/os-Linux/file.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/os-Linux/file.c Tue Oct 21 16:42:38 2003 +@@ -315,7 +315,7 @@ int os_rcv_fd(int fd, int *helper_pid_ou + return(new); + } + +-int create_unix_socket(char *file, int len) ++int create_unix_socket(char *file, int len, int close_on_exec) + { + struct sockaddr_un addr; + int sock, err; +@@ -327,6 +327,10 @@ int create_unix_socket(char *file, int l + return(-errno); + } + ++ if(close_on_exec && fcntl(sock, F_SETFD, 1) < 0) ++ printk("create_unix_socket : Setting FD_CLOEXEC failed, " ++ "errno = %d", errno); ++ + addr.sun_family = AF_UNIX; + + /* XXX Be more careful about overflow */ +@@ -342,6 +346,37 @@ int create_unix_socket(char *file, int l + return(sock); + } + ++void os_flush_stdout(void) ++{ ++ fflush(stdout); ++} ++ ++int os_lock_file(int fd, int excl) ++{ ++ int type = excl ? F_WRLCK : F_RDLCK; ++ struct flock lock = ((struct flock) { .l_type = type, ++ .l_whence = SEEK_SET, ++ .l_start = 0, ++ .l_len = 0 } ); ++ int err, save; ++ ++ err = fcntl(fd, F_SETLK, &lock); ++ if(!err) ++ goto out; ++ ++ save = -errno; ++ err = fcntl(fd, F_GETLK, &lock); ++ if(err){ ++ err = -errno; ++ goto out; ++ } ++ ++ printk("F_SETLK failed, file already locked by pid %d\n", lock.l_pid); ++ err = save; ++ out: ++ return(err); ++} ++ + /* + * Overrides for Emacs so that we follow Linus's tabbing style. + * Emacs will notice this stuff at the end of the file and automatically +diff -puN arch/um/sys-i386/Makefile~uml-summa.diff arch/um/sys-i386/Makefile +--- limbo/arch/um/sys-i386/Makefile~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/sys-i386/Makefile Tue Oct 21 16:42:38 2003 +@@ -1,7 +1,8 @@ +-obj-y = bugs.o checksum.o extable.o fault.o ksyms.o ldt.o module.o \ +- ptrace.o ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o ++obj-y = bugs.o checksum.o extable.o fault.o ksyms.o ldt.o ptrace.o \ ++ ptrace_user.o semaphore.o sigcontext.o syscalls.o sysrq.o + + obj-$(CONFIG_HIGHMEM) += highmem.o ++obj-$(CONFIG_MODULES) += module.o + + USER_OBJS := bugs.o ptrace_user.o sigcontext.o fault.o + USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) +@@ -9,6 +10,8 @@ USER_OBJS := $(foreach file,$(USER_OBJS) + SYMLINKS = semaphore.c highmem.c module.c + SYMLINKS := $(foreach f,$(SYMLINKS),$(src)/$f) + ++clean-files := $(SYMLINKS) ++ + semaphore.c-dir = kernel + highmem.c-dir = mm + module.c-dir = kernel +@@ -24,8 +27,7 @@ $(USER_OBJS) : %.o: %.c + $(SYMLINKS): + $(call make_link,$@) + +-clean: +- $(MAKE) -C util clean ++subdir- := util + + fastdep: + +diff -puN arch/um/sys-i386/bugs.c~uml-summa.diff arch/um/sys-i386/bugs.c +--- limbo/arch/um/sys-i386/bugs.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/sys-i386/bugs.c Tue Oct 21 16:42:38 2003 +@@ -8,6 +8,7 @@ + #include <errno.h> + #include <string.h> + #include <sys/signal.h> ++#include <asm/ldt.h> + #include "kern_util.h" + #include "user.h" + #include "sysdep/ptrace.h" +@@ -16,8 +17,8 @@ + #define MAXTOKEN 64 + + /* Set during early boot */ +-int cpu_has_cmov = 1; +-int cpu_has_xmm = 0; ++int host_has_cmov = 1; ++int host_has_xmm = 0; + + static char token(int fd, char *buf, int len, char stop) + { +@@ -104,6 +105,25 @@ static int check_cpu_feature(char *featu + return(1); + } + ++static void disable_lcall(void) ++{ ++ struct modify_ldt_ldt_s ldt; ++ int err; ++ ++ bzero(&ldt, sizeof(ldt)); ++ 
ldt.entry_number = 7;
++ ldt.base_addr = 0;
++ ldt.limit = 0;
++ err = modify_ldt(1, &ldt, sizeof(ldt));
++ if(err)
++ printk("Failed to disable lcall7 - errno = %d\n", errno);
++}
++
++void arch_init_thread(void)
++{
++ disable_lcall();
++}
++
+ void arch_check_bugs(void)
+ {
+ int have_it;
+@@ -113,8 +133,8 @@ void arch_check_bugs(void)
+ "checks\n");
+ return;
+ }
+- if(check_cpu_feature("cmov", &have_it)) cpu_has_cmov = have_it;
+- if(check_cpu_feature("xmm", &have_it)) cpu_has_xmm = have_it;
++ if(check_cpu_feature("cmov", &have_it)) host_has_cmov = have_it;
++ if(check_cpu_feature("xmm", &have_it)) host_has_xmm = have_it;
+ }
+
+ int arch_handle_signal(int sig, union uml_pt_regs *regs)
+@@ -130,18 +150,18 @@ int arch_handle_signal(int sig, union um
+ if((*((char *) ip) != 0x0f) || ((*((char *) (ip + 1)) & 0xf0) != 0x40))
+ return(0);
+
+- if(cpu_has_cmov == 0)
++ if(host_has_cmov == 0)
+ panic("SIGILL caused by cmov, which this processor doesn't "
+ "implement, boot a filesystem compiled for older "
+ "processors");
+- else if(cpu_has_cmov == 1)
++ else if(host_has_cmov == 1)
+ panic("SIGILL caused by cmov, which this processor claims to "
+ "implement");
+- else if(cpu_has_cmov == -1)
++ else if(host_has_cmov == -1)
+ panic("SIGILL caused by cmov, couldn't tell if this processor "
+ "implements it, boot a filesystem compiled for older "
+ "processors");
+- else panic("Bad value for cpu_has_cmov (%d)", cpu_has_cmov);
++ else panic("Bad value for host_has_cmov (%d)", host_has_cmov);
+ return(0);
+ }
+
+diff -puN arch/um/sys-i386/fault.c~uml-summa.diff arch/um/sys-i386/fault.c
+--- limbo/arch/um/sys-i386/fault.c~uml-summa.diff Tue Oct 21 16:42:38 2003
++++ limbo-god/arch/um/sys-i386/fault.c Tue Oct 21 16:42:38 2003
+@@ -7,14 +7,24 @@
+ #include "sysdep/ptrace.h"
+ #include "sysdep/sigcontext.h"
+
+-extern unsigned long search_exception_table(unsigned long addr);
++struct exception_table_entry
++{
++ unsigned long insn;
++ unsigned long fixup;
++};
++const struct exception_table_entry *search_exception_tables(unsigned long add);
+
+ int arch_fixup(unsigned long address, void *sc_ptr)
+ {
+ struct sigcontext *sc = sc_ptr;
+- unsigned long fixup;
++ long fixup;
++ const struct exception_table_entry *ete;
+
+- fixup = search_exception_tables(address);
++ ete = search_exception_tables(address);
++ if (!ete)
++ return 0;
++
++ fixup = ete->fixup;
+ if(fixup != 0){
+ sc->eip = fixup;
+ return(1);
+diff -puN arch/um/uml.lds.S~uml-summa.diff arch/um/uml.lds.S
+--- limbo/arch/um/uml.lds.S~uml-summa.diff Tue Oct 21 16:42:38 2003
++++ limbo-god/arch/um/uml.lds.S Tue Oct 21 16:42:38 2003
+@@ -26,7 +26,11 @@ SECTIONS
+ . = ALIGN(4096); /* Init code and data */
+ _stext = .;
+ __init_begin = .;
+- .text.init : { *(.text.init) }
++ .init.text : {
++ _sinittext = .;
++ *(.init.text)
++ _einittext = .;
++ }
+ . = ALIGN(4096);
+ .text :
+ {
+@@ -38,7 +42,7 @@ SECTIONS
+
+ #include "asm/common.lds.S"
+
+- .data.init : { *(.data.init) }
++ .init.data : { *(.init.data) }
+ .data :
+ {
+ .
= ALIGN(KERNEL_STACK_SIZE); /* init_task */ +diff -puN arch/um/util/mk_constants_kern.c~uml-summa.diff arch/um/util/mk_constants_kern.c +--- limbo/arch/um/util/mk_constants_kern.c~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/arch/um/util/mk_constants_kern.c Tue Oct 21 16:42:38 2003 +@@ -1,5 +1,6 @@ + #include "linux/kernel.h" + #include "linux/stringify.h" ++#include "linux/time.h" + #include "asm/page.h" + + extern void print_head(void); +@@ -11,6 +12,7 @@ int main(int argc, char **argv) + { + print_head(); + print_constant_int("UM_KERN_PAGE_SIZE", PAGE_SIZE); ++ + print_constant_str("UM_KERN_EMERG", KERN_EMERG); + print_constant_str("UM_KERN_ALERT", KERN_ALERT); + print_constant_str("UM_KERN_CRIT", KERN_CRIT); +@@ -19,6 +21,8 @@ int main(int argc, char **argv) + print_constant_str("UM_KERN_NOTICE", KERN_NOTICE); + print_constant_str("UM_KERN_INFO", KERN_INFO); + print_constant_str("UM_KERN_DEBUG", KERN_DEBUG); ++ ++ print_constant_int("UM_NSEC_PER_SEC", NSEC_PER_SEC); + print_tail(); + return(0); + } +diff -puN fs/Makefile~uml-summa.diff fs/Makefile +--- limbo/fs/Makefile~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/fs/Makefile Tue Oct 21 16:42:38 2003 +@@ -91,3 +91,5 @@ obj-$(CONFIG_JFS_FS) += jfs/ + obj-$(CONFIG_XFS_FS) += xfs/ + obj-$(CONFIG_AFS_FS) += afs/ + obj-$(CONFIG_BEFS_FS) += befs/ ++obj-$(CONFIG_HOSTFS) += hostfs/ ++obj-$(CONFIG_HPPFS) += hppfs/ +diff -puN include/asm-um/archparam-i386.h~uml-summa.diff include/asm-um/archparam-i386.h +--- limbo/include/asm-um/archparam-i386.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/archparam-i386.h Tue Oct 21 16:42:38 2003 +@@ -56,6 +56,65 @@ typedef elf_greg_t elf_gregset_t[ELF_NGR + pr_reg[16] = PT_REGS_SS(regs); \ + } while(0); + ++#define VSYSCALL_BASE (__fix_to_virt(FIX_VSYSCALL)) ++#define VSYSCALL_EHDR ((const struct elfhdr *) VSYSCALL_BASE) ++#define VSYSCALL_ENTRY ((unsigned long) &__kernel_vsyscall) ++extern void *__kernel_vsyscall; ++ ++/* ++ * Architecture-neutral AT_ values in 0-17, leave some room ++ * for more of them, start the x86-specific ones at 32. ++ */ ++#define AT_SYSINFO 32 ++#define AT_SYSINFO_EHDR 33 ++ ++#define ARCH_DLINFO \ ++do { \ ++ NEW_AUX_ENT(AT_SYSINFO, VSYSCALL_ENTRY); \ ++ NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL_BASE); \ ++} while (0) ++ ++/* ++ * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out ++ * extra segments containing the vsyscall DSO contents. Dumping its ++ * contents makes post-mortem fully interpretable later without matching up ++ * the same kernel and hardware config to see what PC values meant. ++ * Dumping its extra ELF program headers includes all the other information ++ * a debugger needs to easily find how the vsyscall DSO was being used. 
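The ARCH_DLINFO above is what plants AT_SYSINFO and AT_SYSINFO_EHDR in each new process's ELF auxiliary vector. A quick standalone way to inspect those entries from userspace, illustrative only: getauxval() is a much newer glibc convenience (2.16+) than this patch, and either entry simply reads as 0 on a kernel that doesn't supply it.

#include <stdio.h>
#include <elf.h>		/* AT_SYSINFO_EHDR */
#include <sys/auxv.h>		/* getauxval(), glibc >= 2.16 */

#ifndef AT_SYSINFO
#define AT_SYSINFO 32		/* the x86 value defined above */
#endif

int main(void)
{
	/* address of the vsyscall entry point and of the DSO's ELF
	 * header, as passed to every process through ARCH_DLINFO */
	printf("AT_SYSINFO      = %#lx\n", getauxval(AT_SYSINFO));
	printf("AT_SYSINFO_EHDR = %#lx\n", getauxval(AT_SYSINFO_EHDR));
	return(0);
}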
++ */ ++#define ELF_CORE_EXTRA_PHDRS (VSYSCALL_EHDR->e_phnum) ++#define ELF_CORE_WRITE_EXTRA_PHDRS \ ++do { \ ++ const struct elf_phdr *const vsyscall_phdrs = \ ++ (const struct elf_phdr *) (VSYSCALL_BASE \ ++ + VSYSCALL_EHDR->e_phoff); \ ++ int i; \ ++ Elf32_Off ofs = 0; \ ++ for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ ++ struct elf_phdr phdr = vsyscall_phdrs[i]; \ ++ if (phdr.p_type == PT_LOAD) { \ ++ ofs = phdr.p_offset = offset; \ ++ offset += phdr.p_filesz; \ ++ } \ ++ else \ ++ phdr.p_offset += ofs; \ ++ phdr.p_paddr = 0; /* match other core phdrs */ \ ++ DUMP_WRITE(&phdr, sizeof(phdr)); \ ++ } \ ++} while (0) ++#define ELF_CORE_WRITE_EXTRA_DATA \ ++do { \ ++ const struct elf_phdr *const vsyscall_phdrs = \ ++ (const struct elf_phdr *) (VSYSCALL_BASE \ ++ + VSYSCALL_EHDR->e_phoff); \ ++ int i; \ ++ for (i = 0; i < VSYSCALL_EHDR->e_phnum; ++i) { \ ++ if (vsyscall_phdrs[i].p_type == PT_LOAD) \ ++ DUMP_WRITE((void *) vsyscall_phdrs[i].p_vaddr, \ ++ vsyscall_phdrs[i].p_filesz); \ ++ } \ ++} while (0) ++ + /********* Bits for asm-um/delay.h **********/ + + typedef unsigned long um_udelay_t; +diff -puN include/asm-um/bug.h~uml-summa.diff include/asm-um/bug.h +--- limbo/include/asm-um/bug.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/bug.h Tue Oct 21 16:42:38 2003 +@@ -1,30 +1,19 @@ + #ifndef __UM_BUG_H + #define __UM_BUG_H + +-#ifndef __ASSEMBLY__ ++#include "arch/bug.h" ++ ++#undef BUG ++#undef PAGE_BUG + + #define BUG() do { \ +- panic("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \ ++ dump_stack(); \ ++ panic("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \ + } while (0) + +-#define BUG_ON(condition) do { \ +- if (unlikely((condition)!=0)) \ +- BUG(); \ +-} while(0) +- + #define PAGE_BUG(page) do { \ +- BUG(); \ +-} while (0) +- +-#define WARN_ON(condition) do { \ +- if (unlikely((condition)!=0)) { \ +- printk("Badness in %s at %s:%d\n", __FUNCTION__, __FILE__, __LINE__); \ +- dump_stack(); \ +- } \ ++ BUG(); \ + } while (0) + +-extern int foo; +- +-#endif + + #endif +diff -puN include/asm-um/common.lds.S~uml-summa.diff include/asm-um/common.lds.S +--- limbo/include/asm-um/common.lds.S~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/common.lds.S Tue Oct 21 16:42:38 2003 +@@ -1,3 +1,5 @@ ++#include <asm-generic/vmlinux.lds.h> ++ + .fini : { *(.fini) } =0x9090 + _etext = .; + PROVIDE (etext = .); +@@ -13,14 +15,6 @@ + + RODATA + +- __start___ksymtab = .; /* Kernel symbol table */ +- __ksymtab : { *(__ksymtab) } +- __stop___ksymtab = .; +- +- __start___gpl_ksymtab = .; /* Kernel symbol table: GPL-only symbols */ +- __gpl_ksymtab : { *(__gpl_ksymtab) } +- __stop___gpl_ksymtab = .; +- + __start___kallsyms = .; /* All kernel symbols */ + __kallsyms : { *(__kallsyms) } + __stop___kallsyms = .; +@@ -67,6 +61,12 @@ + } + __initcall_end = .; + ++ __con_initcall_start = .; ++ .con_initcall.init : { *(.con_initcall.init) } ++ __con_initcall_end = .; ++ ++ SECURITY_INIT ++ + __uml_initcall_start = .; + .uml.initcall.init : { *(.uml.initcall.init) } + __uml_initcall_end = .; +@@ -80,7 +80,33 @@ + .uml.exitcall : { *(.uml.exitcall.exit) } + __uml_exitcall_end = .; + +- . = ALIGN(4096); ++ . 
= ALIGN(4); ++ __alt_instructions = .; ++ .altinstructions : { *(.altinstructions) } ++ __alt_instructions_end = .; ++ .altinstr_replacement : { *(.altinstr_replacement) } ++ /* .exit.text is discard at runtime, not link time, to deal with references ++ from .altinstructions and .eh_frame */ ++ .exit.text : { *(.exit.text) } ++ .exit.data : { *(.exit.data) } ++ ++ __preinit_array_start = .; ++ .preinit_array : { *(.preinit_array) } ++ __preinit_array_end = .; ++ __init_array_start = .; ++ .init_array : { *(.init_array) } ++ __init_array_end = .; ++ __fini_array_start = .; ++ .fini_array : { *(.fini_array) } ++ __fini_array_end = .; ++ ++ . = ALIGN(4096); + __initramfs_start = .; + .init.ramfs : { *(.init.ramfs) } + __initramfs_end = .; ++ ++ /* Sections to be discarded */ ++ /DISCARD/ : { ++ *(.exitcall.exit) ++ } ++ +diff -puN include/asm-um/current.h~uml-summa.diff include/asm-um/current.h +--- limbo/include/asm-um/current.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/current.h Tue Oct 21 16:42:38 2003 +@@ -16,11 +16,15 @@ struct thread_info; + #define CURRENT_THREAD(dummy) (((unsigned long) &dummy) & \ + (PAGE_MASK << CONFIG_KERNEL_STACK_ORDER)) + +-#define current ({ int dummy; \ +- ((struct thread_info *) CURRENT_THREAD(dummy))->task; }) ++#define current_thread \ ++ ({ int dummy; ((struct thread_info *) CURRENT_THREAD(dummy)); }) ++ ++#define current (current_thread->task) + + #endif /* __ASSEMBLY__ */ + ++extern void *get_current(void); ++ + #endif + + /* +diff -puN include/asm-um/fixmap.h~uml-summa.diff include/asm-um/fixmap.h +--- limbo/include/asm-um/fixmap.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/fixmap.h Tue Oct 21 16:42:38 2003 +@@ -3,6 +3,7 @@ + + #include <linux/config.h> + #include <asm/kmap_types.h> ++#include <asm/bug.h> + + /* + * Here we define all the compile-time 'special' virtual +@@ -34,6 +35,7 @@ enum fixed_addresses { + FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ + FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + #endif ++ FIX_VSYSCALL, + __end_of_fixed_addresses + }; + +@@ -63,6 +65,13 @@ extern unsigned long get_kmem_end(void); + #define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) + #define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT) + ++/* ++ * This is the range that is readable by user mode, and things ++ * acting like user mode such as get_user_pages. ++ */ ++#define FIXADDR_USER_START (__fix_to_virt(FIX_VSYSCALL)) ++#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE) ++ + extern void __this_fixmap_does_not_exist(void); + + /* +diff -puN include/asm-um/irq.h~uml-summa.diff include/asm-um/irq.h +--- limbo/include/asm-um/irq.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/irq.h Tue Oct 21 16:42:38 2003 +@@ -1,15 +1,6 @@ + #ifndef __UM_IRQ_H + #define __UM_IRQ_H + +-/* The i386 irq.h has a struct task_struct in a prototype without including +- * sched.h. This forward declaration kills the resulting warning. 
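The current_thread macro above is the whole trick behind UML's current: kernel stacks are a power-of-two number of pages with the thread_info at the base, so masking any address on the stack recovers it, and the thread_info.h hunk below applies the same mask in current_thread_info(). A userspace model of the arithmetic, with made-up names and constants:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE_MODEL 4096UL
#define STACK_ORDER 2		/* CONFIG_KERNEL_STACK_ORDER stand-in */
#define STACK_SIZE ((1UL << STACK_ORDER) * PAGE_SIZE_MODEL)

struct thread_info_model {
	int cpu;
};

static struct thread_info_model *ti_from_stack_addr(void *addr)
{
	/* round any address on the stack down to the stack's base */
	return((struct thread_info_model *)
	       ((uintptr_t) addr & ~(STACK_SIZE - 1)));
}

int main(void)
{
	/* stand-in for a kernel stack: an aligned block with the
	 * thread_info at its base */
	static union {
		struct thread_info_model ti;
		char stack[STACK_SIZE];
	} s __attribute__((aligned(STACK_SIZE)));
	void *on_stack = &s.stack[STACK_SIZE - 64];

	s.ti.cpu = 3;
	printf("cpu = %d\n", ti_from_stack_addr(on_stack)->cpu);
	return(0);
}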
+- */ +-struct task_struct; +- +-#include "asm/ptrace.h" +- +-#undef NR_IRQS +- + #define TIMER_IRQ 0 + #define UMN_IRQ 1 + #define CONSOLE_IRQ 2 +@@ -28,8 +19,4 @@ struct task_struct; + #define LAST_IRQ XTERM_IRQ + #define NR_IRQS (LAST_IRQ + 1) + +-extern int um_request_irq(unsigned int irq, int fd, int type, +- void (*handler)(int, void *, struct pt_regs *), +- unsigned long irqflags, const char * devname, +- void *dev_id); + #endif +diff -puN include/asm-um/page.h~uml-summa.diff include/asm-um/page.h +--- limbo/include/asm-um/page.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/page.h Tue Oct 21 16:42:38 2003 +@@ -4,7 +4,6 @@ + struct page; + + #include "asm/arch/page.h" +-#include "asm/bug.h" + + #undef __pa + #undef __va +diff -puN include/asm-um/pgtable.h~uml-summa.diff include/asm-um/pgtable.h +--- limbo/include/asm-um/pgtable.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/pgtable.h Tue Oct 21 16:42:38 2003 +@@ -78,12 +78,13 @@ extern unsigned long high_physmem; + + #define _PAGE_PRESENT 0x001 + #define _PAGE_NEWPAGE 0x002 +-#define _PAGE_PROTNONE 0x004 /* If not present */ +-#define _PAGE_RW 0x008 +-#define _PAGE_USER 0x010 +-#define _PAGE_ACCESSED 0x020 +-#define _PAGE_DIRTY 0x040 +-#define _PAGE_NEWPROT 0x080 ++#define _PAGE_NEWPROT 0x004 ++#define _PAGE_FILE 0x008 /* set:pagecache unset:swap */ ++#define _PAGE_PROTNONE 0x010 /* If not present */ ++#define _PAGE_RW 0x020 ++#define _PAGE_USER 0x040 ++#define _PAGE_ACCESSED 0x080 ++#define _PAGE_DIRTY 0x100 + + #define REGION_MASK 0xf0000000 + #define REGION_SHIFT 28 +@@ -202,6 +203,16 @@ extern unsigned long pfn_to_phys(unsigne + #define pfn_pte(pfn, prot) __pte(pfn_to_phys(pfn) | pgprot_val(prot)) + #define pfn_pmd(pfn, prot) __pmd(pfn_to_phys(pfn) | pgprot_val(prot)) + ++/* ++ * Bits 0 through 3 are taken ++ */ ++#define PTE_FILE_MAX_BITS 28 ++ ++#define pte_to_pgoff(pte) ((pte).pte_low >> 4) ++ ++#define pgoff_to_pte(off) \ ++ ((pte_t) { ((off) << 4) + _PAGE_FILE }) ++ + static inline pte_t pte_mknewprot(pte_t pte) + { + pte_val(pte) |= _PAGE_NEWPROT; +@@ -235,6 +246,12 @@ static inline void set_pte(pte_t *pteptr + * The following only work if pte_present() is true. + * Undefined behaviour if not.. + */ ++static inline int pte_user(pte_t pte) ++{ ++ return((pte_val(pte) & _PAGE_USER) && ++ !(pte_val(pte) & _PAGE_PROTNONE)); ++} ++ + static inline int pte_read(pte_t pte) + { + return((pte_val(pte) & _PAGE_USER) && +@@ -252,6 +269,14 @@ static inline int pte_write(pte_t pte) + !(pte_val(pte) & _PAGE_PROTNONE)); + } + ++/* ++ * The following only works if pte_present() is not true. ++ */ ++static inline int pte_file(pte_t pte) ++{ ++ return (pte).pte_low & _PAGE_FILE; ++} ++ + static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } + static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; } + static inline int pte_newpage(pte_t pte) { return pte_val(pte) & _PAGE_NEWPAGE; } +@@ -354,14 +379,26 @@ static inline pte_t pte_modify(pte_t pte + #define pmd_page(pmd) (phys_mem_map(pmd_val(pmd) & PAGE_MASK) + \ + ((phys_addr(pmd_val(pmd)) >> PAGE_SHIFT))) + +-/* to find an entry in a page-table-directory. 
*/ ++/* ++ * the pgd page can be thought of an array like this: pgd_t[PTRS_PER_PGD] ++ * ++ * this macro returns the index of the entry in the pgd page which would ++ * control the given virtual address ++ */ + #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) + +-/* to find an entry in a page-table-directory */ ++/* ++ * pgd_offset() returns a (pgd_t *) ++ * pgd_index() is used get the offset into the pgd page's array of pgd_t's; ++ */ + #define pgd_offset(mm, address) \ + ((mm)->pgd + ((address) >> PGDIR_SHIFT)) + +-/* to find an entry in a kernel page-table-directory */ ++ ++/* ++ * a shortcut which implies the use of the kernel's pgd, instead ++ * of a process's ++ */ + #define pgd_offset_k(address) pgd_offset(&init_mm, address) + + #define pmd_index(address) \ +@@ -373,7 +410,12 @@ static inline pmd_t * pmd_offset(pgd_t * + return (pmd_t *) dir; + } + +-/* Find an entry in the third-level page table.. */ ++/* ++ * the pte page can be thought of an array like this: pte_t[PTRS_PER_PTE] ++ * ++ * this macro returns the index of the entry in the pte page which would ++ * control the given virtual address ++ */ + #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + #define pte_offset_kernel(dir, address) \ + ((pte_t *) pmd_page_kernel(*(dir)) + pte_index(address)) +@@ -399,11 +441,11 @@ typedef pte_t *pte_addr_t; + #define update_mmu_cache(vma,address,pte) do ; while (0) + + /* Encode and de-code a swap entry */ +-#define __swp_type(x) (((x).val >> 3) & 0x7f) +-#define __swp_offset(x) ((x).val >> 10) ++#define __swp_type(x) (((x).val >> 4) & 0x3f) ++#define __swp_offset(x) ((x).val >> 11) + + #define __swp_entry(type, offset) \ +- ((swp_entry_t) { ((type) << 3) | ((offset) << 10) }) ++ ((swp_entry_t) { ((type) << 4) | ((offset) << 11) }) + #define __pte_to_swp_entry(pte) \ + ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) + #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +diff -puN include/asm-um/processor-generic.h~uml-summa.diff include/asm-um/processor-generic.h +--- limbo/include/asm-um/processor-generic.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/processor-generic.h Tue Oct 21 16:42:38 2003 +@@ -11,9 +11,7 @@ struct pt_regs; + struct task_struct; + + #include "linux/config.h" +-#include "linux/signal.h" + #include "asm/ptrace.h" +-#include "asm/siginfo.h" + #include "choose-mode.h" + + struct mm_struct; +@@ -101,14 +99,19 @@ typedef struct { + } mm_segment_t; + + extern struct task_struct *alloc_task_struct(void); +-extern void free_task_struct(struct task_struct *task); + + extern void release_thread(struct task_struct *); + extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); + extern void dump_thread(struct pt_regs *regs, struct user *u); ++extern void prepare_to_copy(struct task_struct *tsk); + + extern unsigned long thread_saved_pc(struct task_struct *t); + ++static inline void mm_copy_segments(struct mm_struct *from_mm, ++ struct mm_struct *new_mm) ++{ ++} ++ + #define init_stack (init_thread_union.stack) + + /* +diff -puN include/asm-um/processor-i386.h~uml-summa.diff include/asm-um/processor-i386.h +--- limbo/include/asm-um/processor-i386.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/processor-i386.h Tue Oct 21 16:42:38 2003 +@@ -6,8 +6,8 @@ + #ifndef __UM_PROCESSOR_I386_H + #define __UM_PROCESSOR_I386_H + +-extern int cpu_has_xmm; +-extern int cpu_has_cmov; ++extern int host_has_xmm; ++extern int host_has_cmov; + + struct arch_thread { + unsigned 
long debugregs[8]; +diff -puN include/asm-um/smp.h~uml-summa.diff include/asm-um/smp.h +--- limbo/include/asm-um/smp.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/smp.h Tue Oct 21 16:42:38 2003 +@@ -10,7 +10,7 @@ + + extern cpumask_t cpu_online_map; + +-#define smp_processor_id() (current->thread_info->cpu) ++#define smp_processor_id() (current_thread->cpu) + #define cpu_logical_map(n) (n) + #define cpu_number_map(n) (n) + #define PROC_CHANGE_PENALTY 15 /* Pick a number, any number */ +@@ -26,6 +26,13 @@ extern inline void smp_cpus_done(unsigne + { + } + ++extern inline int any_online_cpu(unsigned int mask) ++{ ++ if (mask & cpu_online_map) ++ return __ffs(mask & cpu_online_map); ++ ++ return -1; ++} + #endif + + #endif +diff -puN include/asm-um/system-generic.h~uml-summa.diff include/asm-um/system-generic.h +--- limbo/include/asm-um/system-generic.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/system-generic.h Tue Oct 21 16:42:38 2003 +@@ -23,8 +23,10 @@ extern int get_signals(void); + extern void block_signals(void); + extern void unblock_signals(void); + +-#define local_save_flags(flags) do { (flags) = get_signals(); } while(0) +-#define local_irq_restore(flags) do { set_signals(flags); } while(0) ++#define local_save_flags(flags) do { typecheck(unsigned long, flags); \ ++ (flags) = get_signals(); } while(0) ++#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \ ++ set_signals(flags); } while(0) + + #define local_irq_save(flags) do { local_save_flags(flags); \ + local_irq_disable(); } while(0) +@@ -39,4 +41,7 @@ extern void unblock_signals(void); + (flags == 0); \ + }) + ++extern void *_switch_to(void *prev, void *next, void *last); ++#define switch_to(prev, next, last) prev = _switch_to(prev, next, last) ++ + #endif +diff -puN include/asm-um/thread_info.h~uml-summa.diff include/asm-um/thread_info.h +--- limbo/include/asm-um/thread_info.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/thread_info.h Tue Oct 21 16:42:38 2003 +@@ -9,6 +9,7 @@ + #ifndef __ASSEMBLY__ + + #include <asm/processor.h> ++#include <asm/types.h> + + struct thread_info { + struct task_struct *task; /* main task structure */ +@@ -20,6 +21,7 @@ struct thread_info { + mm_segment_t addr_limit; /* thread address space: + 0-0xBFFFFFFF for user + 0-0xFFFFFFFF for kernel */ ++ + struct restart_block restart_block; + }; + +@@ -43,15 +45,18 @@ struct thread_info { + static inline struct thread_info *current_thread_info(void) + { + struct thread_info *ti; +- __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~16383UL)); ++ unsigned long mask = PAGE_SIZE * ++ (1 << CONFIG_KERNEL_STACK_ORDER) - 1; ++ __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~mask)); + return ti; + } + + /* thread information allocation */ +-#define THREAD_SIZE (4*PAGE_SIZE) +-#define alloc_thread_info(tsk) ((struct thread_info *) \ +- __get_free_pages(GFP_KERNEL,2)) +-#define free_thread_info(ti) free_pages((unsigned long) (ti), 2) ++#define THREAD_SIZE ((1 << CONFIG_KERNEL_STACK_ORDER) * PAGE_SIZE) ++#define alloc_thread_info(tsk) \ ++ ((struct thread_info *) kmalloc(THREAD_SIZE, GFP_KERNEL)) ++#define free_thread_info(ti) kfree(ti) ++ + #define get_thread_info(ti) get_task_struct((ti)->task) + #define put_thread_info(ti) put_task_struct((ti)->task) + +@@ -65,11 +70,13 @@ static inline struct thread_info *curren + #define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling + * TIF_NEED_RESCHED + */ ++#define TIF_RESTART_BLOCK 4 + + #define _TIF_SYSCALL_TRACE (1 << 
TIF_SYSCALL_TRACE) + #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) + #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) + #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) ++#define _TIF_RESTART_BLOCK (1 << TIF_RESTART_BLOCK) + + #endif + +diff -puN include/asm-um/timex.h~uml-summa.diff include/asm-um/timex.h +--- limbo/include/asm-um/timex.h~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/include/asm-um/timex.h Tue Oct 21 16:42:38 2003 +@@ -1,8 +1,6 @@ + #ifndef __UM_TIMEX_H + #define __UM_TIMEX_H + +-#include "linux/time.h" +- + typedef unsigned long cycles_t; + + #define cacheflush_time (0) +diff -puN mm/Makefile~uml-summa.diff mm/Makefile +--- limbo/mm/Makefile~uml-summa.diff Tue Oct 21 16:42:38 2003 ++++ limbo-god/mm/Makefile Tue Oct 21 16:42:38 2003 +@@ -12,3 +12,4 @@ obj-y := bootmem.o filemap.o mempool.o + slab.o swap.o truncate.o vmscan.o $(mmu-y) + + obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o ++obj-$(CONFIG_PROC_MM) += proc_mm.o +diff -puN /dev/null fs/hostfs/Makefile +--- /dev/null Sat Mar 23 22:46:34 2002 ++++ limbo-god/fs/hostfs/Makefile Tue Oct 21 16:42:38 2003 +@@ -0,0 +1,36 @@ ++# ++# Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++# struct stat64 changed the inode field name between 2.2 and 2.4 from st_ino ++# to __st_ino. It stayed in the same place, so as long as the correct name ++# is used, hostfs compiled on 2.2 should work on 2.4 and vice versa. ++ ++STAT64_INO_FIELD := $(shell grep -q __st_ino /usr/include/bits/stat.h && \ ++ echo __)st_ino ++ ++hostfs-objs := hostfs_kern.o hostfs_user.o ++ ++obj-y = ++obj-$(CONFIG_HOSTFS) += hostfs.o ++ ++SINGLE_OBJS = $(foreach f,$(patsubst %.o,%,$(obj-y) $(obj-m)),$($(f)-objs)) ++ ++USER_OBJS := $(filter %_user.o,$(obj-y) $(obj-m) $(SINGLE_OBJS)) ++USER_OBJS := $(foreach file,$(USER_OBJS),$(obj)/$(file)) ++ ++USER_CFLAGS += -DSTAT64_INO_FIELD=$(STAT64_INO_FIELD) ++ ++$(USER_OBJS) : %.o: %.c ++ $(CC) $(CFLAGS_$(notdir $@)) $(USER_CFLAGS) -c -o $@ $< ++ ++clean: ++ ++modules: ++ ++fastdep: ++ ++dep: ++ ++archmrproper: clean +diff -puN /dev/null fs/hostfs/hostfs.h +--- /dev/null Sat Mar 23 22:46:34 2002 ++++ limbo-god/fs/hostfs/hostfs.h Tue Oct 21 16:42:38 2003 +@@ -0,0 +1,79 @@ ++#ifndef __UM_FS_HOSTFS ++#define __UM_FS_HOSTFS ++ ++#include "os.h" ++ ++/* These are exactly the same definitions as in fs.h, but the names are ++ * changed so that this file can be included in both kernel and user files. 
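The hostfs.h header that follows is compiled into both the kernel side and the host (libc) side of the filesystem, which is why its prototypes flatten nearly everything into ints and long longs instead of kernel structs. A rough sketch of how the userspace half (hostfs_user.c, not shown in this excerpt) fills in those out parameters; the function name is made up, and the real code uses the 64-bit stat variants (hence the Makefile's __st_ino probe above):

#include <sys/stat.h>
#include <errno.h>

int stat_file_sketch(const char *path, unsigned long long *inode_out,
		     int *mode_out, int *nlink_out,
		     unsigned long long *size_out)
{
	struct stat buf;

	if(lstat(path, &buf) < 0)
		return(-errno);
	if(inode_out != NULL) *inode_out = buf.st_ino;
	if(mode_out != NULL) *mode_out = buf.st_mode;
	if(nlink_out != NULL) *nlink_out = buf.st_nlink;
	if(size_out != NULL) *size_out = buf.st_size;
	return(0);
}

Keeping the interface to scalar types means neither side ever sees the other's conflicting definitions of struct stat and friends.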
++ */ ++ ++#define HOSTFS_ATTR_MODE 1 ++#define HOSTFS_ATTR_UID 2 ++#define HOSTFS_ATTR_GID 4 ++#define HOSTFS_ATTR_SIZE 8 ++#define HOSTFS_ATTR_ATIME 16 ++#define HOSTFS_ATTR_MTIME 32 ++#define HOSTFS_ATTR_CTIME 64 ++#define HOSTFS_ATTR_ATIME_SET 128 ++#define HOSTFS_ATTR_MTIME_SET 256 ++#define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ ++#define HOSTFS_ATTR_ATTR_FLAG 1024 ++ ++struct hostfs_iattr { ++ unsigned int ia_valid; ++ mode_t ia_mode; ++ uid_t ia_uid; ++ gid_t ia_gid; ++ loff_t ia_size; ++ struct timespec ia_atime; ++ struct timespec ia_mtime; ++ struct timespec ia_ctime; ++ unsigned int ia_attr_flags; ++}; ++ ++extern int stat_file(const char *path, unsigned long long *inode_out, ++ int *mode_out, int *nlink_out, int *uid_out, int *gid_out, ++ unsigned long long *size_out, struct timespec *atime_out, ++ struct timespec *mtime_out, struct timespec *ctime_out, ++ int *blksize_out, unsigned long long *blocks_out); ++extern int access_file(char *path, int r, int w, int x); ++extern int open_file(char *path, int r, int w, int append); ++extern int file_type(const char *path, int *rdev); ++extern void *open_dir(char *path, int *err_out); ++extern char *read_dir(void *stream, unsigned long long *pos, ++ unsigned long long *ino_out, int *len_out); ++extern void close_file(void *stream); ++extern void close_dir(void *stream); ++extern int read_file(int fd, unsigned long long *offset, char *buf, int len); ++extern int write_file(int fd, unsigned long long *offset, const char *buf, ++ int len); ++extern int lseek_file(int fd, long long offset, int whence); ++extern int file_create(char *name, int ur, int uw, int ux, int gr, ++ int gw, int gx, int or, int ow, int ox); ++extern int set_attr(const char *file, struct hostfs_iattr *attrs); ++extern int make_symlink(const char *from, const char *to); ++extern int unlink_file(const char *file); ++extern int do_mkdir(const char *file, int mode); ++extern int do_rmdir(const char *file); ++extern int do_mknod(const char *file, int mode, int dev); ++extern int link_file(const char *from, const char *to); ++extern int do_readlink(char *file, char *buf, int size); ++extern int rename_file(char *from, char *to); ++extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, ++ long long *bfree_out, long long *bavail_out, ++ long long *files_out, long long *ffree_out, ++ void *fsid_out, int fsid_size, long *namelen_out, ++ long *spare_out); ++ ++#endif ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -puN /dev/null fs/hostfs/hostfs_kern.c +--- /dev/null Sat Mar 23 22:46:34 2002 ++++ limbo-god/fs/hostfs/hostfs_kern.c Tue Oct 21 16:42:38 2003 +@@ -0,0 +1,1008 @@ ++/* ++ * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ * ++ * Ported the filesystem routines to 2.5. 
++ * 2003-02-10 Petr Baudis <pasky@ucw.cz> ++ */ ++ ++#include <linux/stddef.h> ++#include <linux/fs.h> ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/slab.h> ++#include <linux/pagemap.h> ++#include <linux/blkdev.h> ++#include <linux/list.h> ++#include <linux/buffer_head.h> ++#include <linux/root_dev.h> ++#include <linux/statfs.h> ++#include <asm/uaccess.h> ++#include "hostfs.h" ++#include "kern_util.h" ++#include "kern.h" ++#include "user_util.h" ++#include "2_5compat.h" ++#include "init.h" ++ ++struct hostfs_inode_info { ++ char *host_filename; ++ int fd; ++ int mode; ++ struct inode vfs_inode; ++}; ++ ++static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) ++{ ++ return(list_entry(inode, struct hostfs_inode_info, vfs_inode)); ++} ++ ++#define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_dentry->d_inode) ++ ++int hostfs_d_delete(struct dentry *dentry) ++{ ++ return(1); ++} ++ ++struct dentry_operations hostfs_dentry_ops = { ++ .d_delete = hostfs_d_delete, ++}; ++ ++/* Changed in hostfs_args before the kernel starts running */ ++static char *root_ino = "/"; ++static int append = 0; ++ ++#define HOSTFS_SUPER_MAGIC 0x00c0ffee ++ ++static struct inode_operations hostfs_iops; ++static struct inode_operations hostfs_dir_iops; ++static struct address_space_operations hostfs_link_aops; ++ ++static int __init hostfs_args(char *options, int *add) ++{ ++ char *ptr; ++ ++ ptr = strchr(options, ','); ++ if(ptr != NULL) ++ *ptr++ = '\0'; ++ if(*options != '\0') ++ root_ino = options; ++ ++ options = ptr; ++ while(options){ ++ ptr = strchr(options, ','); ++ if(ptr != NULL) ++ *ptr++ = '\0'; ++ if(*options != '\0'){ ++ if(!strcmp(options, "append")) ++ append = 1; ++ else printf("hostfs_args - unsupported option - %s\n", ++ options); ++ } ++ options = ptr; ++ } ++ return(0); ++} ++ ++__uml_setup("hostfs=", hostfs_args, ++"hostfs=<root dir>,<flags>,...\n" ++" This is used to set hostfs parameters. The root directory argument\n" ++" is used to confine all hostfs mounts to within the specified directory\n" ++" tree on the host. 
If this isn't specified, then a user inside UML can\n" ++" mount anything on the host that's accessible to the user that's running\n" ++" it.\n" ++" The only flag currently supported is 'append', which specifies that all\n" ++" files opened by hostfs will be opened in append mode.\n\n" ++); ++ ++static char *dentry_name(struct dentry *dentry, int extra) ++{ ++ struct dentry *parent; ++ char *root, *name; ++ int len; ++ ++ len = 0; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ len += parent->d_name.len + 1; ++ parent = parent->d_parent; ++ } ++ ++ root = HOSTFS_I(parent->d_inode)->host_filename; ++ len += strlen(root); ++ name = kmalloc(len + extra + 1, GFP_KERNEL); ++ if(name == NULL) return(NULL); ++ ++ name[len] = '\0'; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ len -= parent->d_name.len + 1; ++ name[len] = '/'; ++ strncpy(&name[len + 1], parent->d_name.name, ++ parent->d_name.len); ++ parent = parent->d_parent; ++ } ++ strncpy(name, root, strlen(root)); ++ return(name); ++} ++ ++static char *inode_name(struct inode *ino, int extra) ++{ ++ struct dentry *dentry; ++ ++ dentry = list_entry(ino->i_dentry.next, struct dentry, d_alias); ++ return(dentry_name(dentry, extra)); ++} ++ ++static int read_name(struct inode *ino, char *name) ++{ ++ /* The non-int inode fields are copied into ints by stat_file and ++ * then copied into the inode because passing the actual pointers ++ * in and having them treated as int * breaks on big-endian machines ++ */ ++ int err; ++ int i_mode, i_nlink, i_blksize; ++ unsigned long long i_size; ++ unsigned long long i_ino; ++ unsigned long long i_blocks; ++ ++ err = stat_file(name, &i_ino, &i_mode, &i_nlink, &ino->i_uid, ++ &ino->i_gid, &i_size, &ino->i_atime, &ino->i_mtime, ++ &ino->i_ctime, &i_blksize, &i_blocks); ++ if(err) ++ return(err); ++ ++ ino->i_ino = i_ino; ++ ino->i_mode = i_mode; ++ ino->i_nlink = i_nlink; ++ ino->i_size = i_size; ++ ino->i_blksize = i_blksize; ++ ino->i_blocks = i_blocks; ++ if((ino->i_sb->s_dev == ROOT_DEV) && (ino->i_uid == getuid())) ++ ino->i_uid = 0; ++ return(0); ++} ++ ++static char *follow_link(char *link) ++{ ++ int len, n; ++ char *name, *resolved, *end; ++ ++ len = 64; ++ while(1){ ++ n = -ENOMEM; ++ name = kmalloc(len, GFP_KERNEL); ++ if(name == NULL) ++ goto out; ++ ++ n = do_readlink(link, name, len); ++ if(n < len) ++ break; ++ len *= 2; ++ kfree(name); ++ } ++ if(n < 0) ++ goto out_free; ++ ++ if(*name == '/') ++ return(name); ++ ++ end = strrchr(link, '/'); ++ if(end == NULL) ++ return(name); ++ ++ *(end + 1) = '\0'; ++ len = strlen(link) + strlen(name) + 1; ++ ++ resolved = kmalloc(len, GFP_KERNEL); ++ if(resolved == NULL){ ++ n = -ENOMEM; ++ goto out_free; ++ } ++ ++ sprintf(resolved, "%s%s", link, name); ++ kfree(name); ++ kfree(link); ++ return(resolved); ++ ++ out_free: ++ kfree(name); ++ out: ++ return(ERR_PTR(n)); ++} ++ ++static int read_inode(struct inode *ino) ++{ ++ char *name; ++ int err = 0; ++ ++ /* Unfortunately, we are called from iget() when we don't have a dentry ++ * allocated yet. 
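dentry_name() above builds the host path in two passes: one walk up the d_parent chain to size the buffer, then a second walk filling components in backwards from the end. The same scheme in standalone form, with a toy dentry type (names here are illustrative, not UML's):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct dent {
	const char *name;
	struct dent *parent;	/* the root is its own parent */
};

static char *dent_path(struct dent *d, const char *root)
{
	struct dent *p;
	char *name;
	int len = strlen(root);

	for(p = d; p->parent != p; p = p->parent)
		len += strlen(p->name) + 1;	/* "/" plus component */

	name = malloc(len + 1);
	if(name == NULL) return(NULL);

	name[len] = '\0';
	for(p = d; p->parent != p; p = p->parent){
		len -= strlen(p->name) + 1;
		name[len] = '/';
		memcpy(&name[len + 1], p->name, strlen(p->name));
	}
	memcpy(name, root, strlen(root));
	return(name);
}

int main(void)
{
	struct dent root = { "", &root };
	struct dent usr = { "usr", &root };
	struct dent bin = { "bin", &usr };
	char *s = dent_path(&bin, "/host");

	printf("%s\n", s);	/* prints /host/usr/bin */
	free(s);
	return(0);
}

Filling backwards avoids either reversing the component list or repeated strcat() passes over the buffer.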
++ */ ++ if(list_empty(&ino->i_dentry)) ++ goto out; ++ ++ err = -ENOMEM; ++ name = inode_name(ino, 0); ++ if(name == NULL) ++ goto out; ++ ++ if(file_type(name, NULL) == OS_TYPE_SYMLINK){ ++ name = follow_link(name); ++ if(IS_ERR(name)){ ++ err = PTR_ERR(name); ++ goto out; ++ } ++ } ++ ++ err = read_name(ino, name); ++ kfree(name); ++ out: ++ return(err); ++} ++ ++int hostfs_statfs(struct super_block *sb, struct kstatfs *sf) ++{ ++ /* do_statfs uses struct statfs64 internally, but the linux kernel ++ * struct statfs still has 32-bit versions for most of these fields, ++ * so we convert them here ++ */ ++ int err; ++ long long f_blocks; ++ long long f_bfree; ++ long long f_bavail; ++ long long f_files; ++ long long f_ffree; ++ ++ err = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename, ++ &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, ++ &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), ++ &sf->f_namelen, sf->f_spare); ++ if(err) return(err); ++ sf->f_blocks = f_blocks; ++ sf->f_bfree = f_bfree; ++ sf->f_bavail = f_bavail; ++ sf->f_files = f_files; ++ sf->f_ffree = f_ffree; ++ sf->f_type = HOSTFS_SUPER_MAGIC; ++ return(0); ++} ++ ++static struct inode *hostfs_alloc_inode(struct super_block *sb) ++{ ++ struct hostfs_inode_info *hi; ++ ++ hi = kmalloc(sizeof(*hi), GFP_KERNEL); ++ if(hi == NULL) ++ return(NULL); ++ ++ *hi = ((struct hostfs_inode_info) { .host_filename = NULL, ++ .fd = -1, ++ .mode = 0 }); ++ inode_init_once(&hi->vfs_inode); ++ return(&hi->vfs_inode); ++} ++ ++static void hostfs_destroy_inode(struct inode *inode) ++{ ++ if(HOSTFS_I(inode)->host_filename) ++ kfree(HOSTFS_I(inode)->host_filename); ++ ++ if(HOSTFS_I(inode)->fd != -1) ++ close_file(&HOSTFS_I(inode)->fd); ++ ++ kfree(HOSTFS_I(inode)); ++} ++ ++static void hostfs_read_inode(struct inode *inode) ++{ ++ read_inode(inode); ++} ++ ++static struct super_operations hostfs_sbops = { ++ .alloc_inode = hostfs_alloc_inode, ++ .destroy_inode = hostfs_destroy_inode, ++ .read_inode = hostfs_read_inode, ++ .statfs = hostfs_statfs, ++}; ++ ++int hostfs_readdir(struct file *file, void *ent, filldir_t filldir) ++{ ++ void *dir; ++ char *name; ++ unsigned long long next, ino; ++ int error, len; ++ ++ name = dentry_name(file->f_dentry, 0); ++ if(name == NULL) return(-ENOMEM); ++ dir = open_dir(name, &error); ++ kfree(name); ++ if(dir == NULL) return(-error); ++ next = file->f_pos; ++ while((name = read_dir(dir, &next, &ino, &len)) != NULL){ ++ error = (*filldir)(ent, name, len, file->f_pos, ++ ino, DT_UNKNOWN); ++ if(error) break; ++ file->f_pos = next; ++ } ++ close_dir(dir); ++ return(0); ++} ++ ++int hostfs_file_open(struct inode *ino, struct file *file) ++{ ++ char *name; ++ int mode = 0, r = 0, w = 0, fd; ++ ++ mode = file->f_mode & (FMODE_READ | FMODE_WRITE); ++ if((mode & HOSTFS_I(ino)->mode) == mode) ++ return(0); ++ ++ /* The file may already have been opened, but with the wrong access, ++ * so this resets things and reopens the file with the new access. 
++ */
++ if(HOSTFS_I(ino)->fd != -1){
++ close_file(&HOSTFS_I(ino)->fd);
++ HOSTFS_I(ino)->fd = -1;
++ }
++
++ HOSTFS_I(ino)->mode |= mode;
++ if(HOSTFS_I(ino)->mode & FMODE_READ)
++ r = 1;
++ if(HOSTFS_I(ino)->mode & FMODE_WRITE)
++ w = 1;
++ if(w)
++ r = 1;
++
++ name = dentry_name(file->f_dentry, 0);
++ if(name == NULL)
++ return(-ENOMEM);
++
++ fd = open_file(name, r, w, append);
++ kfree(name);
++ if(fd < 0) return(fd);
++ FILE_HOSTFS_I(file)->fd = fd;
++
++ return(0);
++}
++
++int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
++{
++ return(0);
++}
++
++static struct file_operations hostfs_file_fops = {
++ .llseek = generic_file_llseek,
++ .read = generic_file_read,
++ .write = generic_file_write,
++ .mmap = generic_file_mmap,
++ .open = hostfs_file_open,
++ .release = NULL,
++ .fsync = hostfs_fsync,
++};
++
++static struct file_operations hostfs_dir_fops = {
++ .readdir = hostfs_readdir,
++ .read = generic_read_dir,
++};
++
++int hostfs_writepage(struct page *page, struct writeback_control *wbc)
++{
++ struct address_space *mapping = page->mapping;
++ struct inode *inode = mapping->host;
++ char *buffer;
++ unsigned long long base;
++ int count = PAGE_CACHE_SIZE;
++ int end_index = inode->i_size >> PAGE_CACHE_SHIFT;
++ int err;
++
++ if (page->index >= end_index)
++ count = inode->i_size & (PAGE_CACHE_SIZE-1);
++
++ buffer = kmap(page);
++ base = ((unsigned long long) page->index) << PAGE_CACHE_SHIFT;
++
++ err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
++ if(err != count){
++ ClearPageUptodate(page);
++ goto out;
++ }
++
++ if (base > inode->i_size)
++ inode->i_size = base;
++
++ if (PageError(page))
++ ClearPageError(page);
++ err = 0;
++
++ out:
++ kunmap(page);
++
++ unlock_page(page);
++ return err;
++}
++
++int hostfs_readpage(struct file *file, struct page *page)
++{
++ char *buffer;
++ long long start;
++ int err = 0;
++
++ start = (long long) page->index << PAGE_CACHE_SHIFT;
++ buffer = kmap(page);
++ err = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
++ PAGE_CACHE_SIZE);
++ if(err < 0) goto out;
++
++ memset(&buffer[err], 0, PAGE_CACHE_SIZE - err);
++
++ flush_dcache_page(page);
++ SetPageUptodate(page);
++ if (PageError(page)) ClearPageError(page);
++ err = 0;
++ out:
++ kunmap(page);
++ unlock_page(page);
++ return(err);
++}
++
++int hostfs_prepare_write(struct file *file, struct page *page,
++ unsigned int from, unsigned int to)
++{
++ char *buffer;
++ long long start, tmp;
++ int err;
++
++ start = (long long) page->index << PAGE_CACHE_SHIFT;
++ buffer = kmap(page);
++ if(from != 0){
++ tmp = start;
++ err = read_file(FILE_HOSTFS_I(file)->fd, &tmp, buffer,
++ from);
++ if(err < 0) goto out;
++ }
++ if(to != PAGE_CACHE_SIZE){
++ start += to;
++ err = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer + to,
++ PAGE_CACHE_SIZE - to);
++ if(err < 0) goto out;
++ }
++ err = 0;
++ out:
++ kunmap(page);
++ return(err);
++}
++
++int hostfs_commit_write(struct file *file, struct page *page, unsigned from,
++ unsigned to)
++{
++ struct address_space *mapping = page->mapping;
++ struct inode *inode = mapping->host;
++ char *buffer;
++ long long start;
++ int err = 0;
++
++ start = ((long long) page->index << PAGE_CACHE_SHIFT) + from;
++ buffer = kmap(page);
++ err = write_file(FILE_HOSTFS_I(file)->fd, &start, buffer + from,
++ to - from);
++ if(err > 0) err = 0;
++ if(!err && (start > inode->i_size))
++ inode->i_size = start;
++
++ kunmap(page);
++ return(err);
++}
++
++static struct address_space_operations hostfs_aops = {
++ .writepage =
hostfs_writepage, ++ .readpage = hostfs_readpage, ++/* .set_page_dirty = __set_page_dirty_nobuffers, */ ++ .prepare_write = hostfs_prepare_write, ++ .commit_write = hostfs_commit_write ++}; ++ ++static int init_inode(struct inode *inode, struct dentry *dentry) ++{ ++ char *name; ++ int type, err = -ENOMEM, rdev; ++ ++ if(dentry){ ++ name = dentry_name(dentry, 0); ++ if(name == NULL) ++ goto out; ++ type = file_type(name, &rdev); ++ kfree(name); ++ } ++ else type = OS_TYPE_DIR; ++ ++ err = 0; ++ if(type == OS_TYPE_SYMLINK) ++ inode->i_op = &page_symlink_inode_operations; ++ else if(type == OS_TYPE_DIR) ++ inode->i_op = &hostfs_dir_iops; ++ else inode->i_op = &hostfs_iops; ++ ++ if(type == OS_TYPE_DIR) inode->i_fop = &hostfs_dir_fops; ++ else inode->i_fop = &hostfs_file_fops; ++ ++ if(type == OS_TYPE_SYMLINK) ++ inode->i_mapping->a_ops = &hostfs_link_aops; ++ else inode->i_mapping->a_ops = &hostfs_aops; ++ ++ switch (type) { ++ case OS_TYPE_CHARDEV: ++ init_special_inode(inode, S_IFCHR, rdev); ++ break; ++ case OS_TYPE_BLOCKDEV: ++ init_special_inode(inode, S_IFBLK, rdev); ++ break; ++ case OS_TYPE_FIFO: ++ init_special_inode(inode, S_IFIFO, 0); ++ break; ++ case OS_TYPE_SOCK: ++ init_special_inode(inode, S_IFSOCK, 0); ++ break; ++ } ++ out: ++ return(err); ++} ++ ++int hostfs_create(struct inode *dir, struct dentry *dentry, int mode, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ char *name; ++ int error, fd; ++ ++ error = -ENOMEM; ++ inode = iget(dir->i_sb, 0); ++ if(inode == NULL) goto out; ++ ++ error = init_inode(inode, dentry); ++ if(error) ++ goto out_put; ++ ++ error = -ENOMEM; ++ name = dentry_name(dentry, 0); ++ if(name == NULL) ++ goto out_put; ++ ++ fd = file_create(name, ++ mode & S_IRUSR, mode & S_IWUSR, mode & S_IXUSR, ++ mode & S_IRGRP, mode & S_IWGRP, mode & S_IXGRP, ++ mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH); ++ if(fd < 0) ++ error = fd; ++ else error = read_name(inode, name); ++ ++ kfree(name); ++ if(error) ++ goto out_put; ++ ++ HOSTFS_I(inode)->fd = fd; ++ HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE; ++ d_instantiate(dentry, inode); ++ return(0); ++ ++ out_put: ++ iput(inode); ++ out: ++ return(error); ++} ++ ++struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry, ++ struct nameidata *nd) ++{ ++ struct inode *inode; ++ char *name; ++ int err; ++ ++ err = -ENOMEM; ++ inode = iget(ino->i_sb, 0); ++ if(inode == NULL) ++ goto out; ++ ++ err = init_inode(inode, dentry); ++ if(err) ++ goto out_put; ++ ++ err = -ENOMEM; ++ name = dentry_name(dentry, 0); ++ if(name == NULL) ++ goto out_put; ++ ++ err = read_name(inode, name); ++ kfree(name); ++ if(err == -ENOENT){ ++ iput(inode); ++ inode = NULL; ++ } ++ else if(err) ++ goto out_put; ++ ++ d_add(dentry, inode); ++ dentry->d_op = &hostfs_dentry_ops; ++ return(NULL); ++ ++ out_put: ++ iput(inode); ++ out: ++ return(ERR_PTR(err)); ++} ++ ++static char *inode_dentry_name(struct inode *ino, struct dentry *dentry) ++{ ++ char *file; ++ int len; ++ ++ file = inode_name(ino, dentry->d_name.len + 1); ++ if(file == NULL) return(NULL); ++ strcat(file, "/"); ++ len = strlen(file); ++ strncat(file, dentry->d_name.name, dentry->d_name.len); ++ file[len + dentry->d_name.len] = '\0'; ++ return(file); ++} ++ ++int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from) ++{ ++ char *from_name, *to_name; ++ int err; ++ ++ if((from_name = inode_dentry_name(ino, from)) == NULL) ++ return(-ENOMEM); ++ to_name = dentry_name(to, 0); ++ if(to_name == NULL){ ++ kfree(from_name); ++ return(-ENOMEM); ++ 
}
++ err = link_file(to_name, from_name);
++ kfree(from_name);
++ kfree(to_name);
++ return(err);
++}
++
++int hostfs_unlink(struct inode *ino, struct dentry *dentry)
++{
++ char *file;
++ int err;
++
++ /* Check append before building the name so the name isn't leaked. */
++ if(append)
++ return(-EPERM);
++
++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
++
++ err = unlink_file(file);
++ kfree(file);
++ return(err);
++}
++
++int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
++{
++ char *file;
++ int err;
++
++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
++ err = make_symlink(file, to);
++ kfree(file);
++ return(err);
++}
++
++int hostfs_mkdir(struct inode *ino, struct dentry *dentry, int mode)
++{
++ char *file;
++ int err;
++
++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
++ err = do_mkdir(file, mode);
++ kfree(file);
++ return(err);
++}
++
++int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
++{
++ char *file;
++ int err;
++
++ if((file = inode_dentry_name(ino, dentry)) == NULL) return(-ENOMEM);
++ err = do_rmdir(file);
++ kfree(file);
++ return(err);
++}
++
++int hostfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
++{
++ struct inode *inode;
++ char *name;
++ int err = -ENOMEM;
++
++ inode = iget(dir->i_sb, 0);
++ if(inode == NULL)
++ goto out;
++
++ err = init_inode(inode, dentry);
++ if(err)
++ goto out_put;
++
++ err = -ENOMEM;
++ name = dentry_name(dentry, 0);
++ if(name == NULL)
++ goto out_put;
++
++ init_special_inode(inode, mode, dev);
++ err = do_mknod(name, mode, dev);
++ if(err)
++ goto out_free;
++
++ err = read_name(inode, name);
++ kfree(name);
++ if(err)
++ goto out_put;
++
++ d_instantiate(dentry, inode);
++ return(0);
++
++ out_free:
++ kfree(name);
++ out_put:
++ iput(inode);
++ out:
++ return(err);
++}
++
++int hostfs_rename(struct inode *from_ino, struct dentry *from,
++ struct inode *to_ino, struct dentry *to)
++{
++ char *from_name, *to_name;
++ int err;
++
++ if((from_name = inode_dentry_name(from_ino, from)) == NULL)
++ return(-ENOMEM);
++ if((to_name = inode_dentry_name(to_ino, to)) == NULL){
++ kfree(from_name);
++ return(-ENOMEM);
++ }
++ err = rename_file(from_name, to_name);
++ kfree(from_name);
++ kfree(to_name);
++ return(err);
++}
++
++void hostfs_truncate(struct inode *ino)
++{
++ not_implemented();
++}
++
++int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd)
++{
++ char *name;
++ int r = 0, w = 0, x = 0, err;
++
++ if(desired & MAY_READ) r = 1;
++ if(desired & MAY_WRITE) w = 1;
++ if(desired & MAY_EXEC) x = 1;
++ name = inode_name(ino, 0);
++ if(name == NULL) return(-ENOMEM);
++ err = access_file(name, r, w, x);
++ kfree(name);
++ if(!err) err = vfs_permission(ino, desired);
++ return(err);
++}
++
++int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
++{
++ struct hostfs_iattr attrs;
++ char *name;
++ int err;
++
++ if(append)
++ attr->ia_valid &= ~ATTR_SIZE;
++
++ attrs.ia_valid = 0;
++ if(attr->ia_valid & ATTR_MODE){
++ attrs.ia_valid |= HOSTFS_ATTR_MODE;
++ attrs.ia_mode = attr->ia_mode;
++ }
++ if(attr->ia_valid & ATTR_UID){
++ if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) &&
++ (attr->ia_uid == 0))
++ attr->ia_uid = getuid();
++ attrs.ia_valid |= HOSTFS_ATTR_UID;
++ attrs.ia_uid = attr->ia_uid;
++ }
++ if(attr->ia_valid & ATTR_GID){
++ if((dentry->d_inode->i_sb->s_dev == ROOT_DEV) &&
++ (attr->ia_gid == 0))
++ attr->ia_gid = getgid();
++ attrs.ia_valid |= HOSTFS_ATTR_GID;
++ attrs.ia_gid = attr->ia_gid;
++ }
++ if(attr->ia_valid & ATTR_SIZE){
++ 
attrs.ia_valid |= HOSTFS_ATTR_SIZE; ++ attrs.ia_size = attr->ia_size; ++ } ++ if(attr->ia_valid & ATTR_ATIME){ ++ attrs.ia_valid |= HOSTFS_ATTR_ATIME; ++ attrs.ia_atime = attr->ia_atime; ++ } ++ if(attr->ia_valid & ATTR_MTIME){ ++ attrs.ia_valid |= HOSTFS_ATTR_MTIME; ++ attrs.ia_mtime = attr->ia_mtime; ++ } ++ if(attr->ia_valid & ATTR_CTIME){ ++ attrs.ia_valid |= HOSTFS_ATTR_CTIME; ++ attrs.ia_ctime = attr->ia_ctime; ++ } ++ if(attr->ia_valid & ATTR_ATIME_SET){ ++ attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET; ++ } ++ if(attr->ia_valid & ATTR_MTIME_SET){ ++ attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET; ++ } ++ name = dentry_name(dentry, 0); ++ if(name == NULL) return(-ENOMEM); ++ err = set_attr(name, &attrs); ++ kfree(name); ++ if(err) ++ return(err); ++ ++ return(inode_setattr(dentry->d_inode, attr)); ++} ++ ++int hostfs_getattr(struct vfsmount *mnt, struct dentry *dentry, ++ struct kstat *stat) ++{ ++ generic_fillattr(dentry->d_inode, stat); ++ return(0); ++} ++ ++static struct inode_operations hostfs_iops = { ++ .create = hostfs_create, ++ .link = hostfs_link, ++ .unlink = hostfs_unlink, ++ .symlink = hostfs_symlink, ++ .mkdir = hostfs_mkdir, ++ .rmdir = hostfs_rmdir, ++ .mknod = hostfs_mknod, ++ .rename = hostfs_rename, ++ .truncate = hostfs_truncate, ++ .permission = hostfs_permission, ++ .setattr = hostfs_setattr, ++ .getattr = hostfs_getattr, ++}; ++ ++static struct inode_operations hostfs_dir_iops = { ++ .create = hostfs_create, ++ .lookup = hostfs_lookup, ++ .link = hostfs_link, ++ .unlink = hostfs_unlink, ++ .symlink = hostfs_symlink, ++ .mkdir = hostfs_mkdir, ++ .rmdir = hostfs_rmdir, ++ .mknod = hostfs_mknod, ++ .rename = hostfs_rename, ++ .truncate = hostfs_truncate, ++ .permission = hostfs_permission, ++ .setattr = hostfs_setattr, ++ .getattr = hostfs_getattr, ++}; ++ ++int hostfs_link_readpage(struct file *file, struct page *page) ++{ ++ char *buffer, *name; ++ long long start; ++ int err; ++ ++ start = page->index << PAGE_CACHE_SHIFT; ++ buffer = kmap(page); ++ name = inode_name(page->mapping->host, 0); ++ if(name == NULL) return(-ENOMEM); ++ err = do_readlink(name, buffer, PAGE_CACHE_SIZE); ++ kfree(name); ++ if(err == PAGE_CACHE_SIZE) ++ err = -E2BIG; ++ else if(err > 0){ ++ flush_dcache_page(page); ++ SetPageUptodate(page); ++ if (PageError(page)) ClearPageError(page); ++ err = 0; ++ } ++ kunmap(page); ++ unlock_page(page); ++ return(err); ++} ++ ++static struct address_space_operations hostfs_link_aops = { ++ .readpage = hostfs_link_readpage, ++}; ++ ++static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) ++{ ++ struct inode *root_inode; ++ char *name, *data = d; ++ int err; ++ ++ sb->s_blocksize = 1024; ++ sb->s_blocksize_bits = 10; ++ sb->s_magic = HOSTFS_SUPER_MAGIC; ++ sb->s_op = &hostfs_sbops; ++ ++ if((data == NULL) || (*data == '\0')) ++ data = root_ino; ++ ++ err = -ENOMEM; ++ name = kmalloc(strlen(data) + 1, GFP_KERNEL); ++ if(name == NULL) ++ goto out; ++ ++ strcpy(name, data); ++ ++ root_inode = iget(sb, 0); ++ if(root_inode == NULL) ++ goto out_free; ++ ++ err = init_inode(root_inode, NULL); ++ if(err) ++ goto out_put; ++ ++ HOSTFS_I(root_inode)->host_filename = name; ++ ++ err = -ENOMEM; ++ sb->s_root = d_alloc_root(root_inode); ++ if(sb->s_root == NULL) ++ goto out_put; ++ ++ err = read_inode(root_inode); ++ if(err) ++ goto out_put; ++ ++ return(0); ++ ++ out_put: ++ iput(root_inode); ++ out_free: ++ kfree(name); ++ out: ++ return(err); ++} ++ ++static struct super_block *hostfs_read_sb(struct file_system_type *type, ++ int flags, const char 
*dev_name, ++ void *data) ++{ ++ return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common)); ++} ++ ++static struct file_system_type hostfs_type = { ++ .owner = THIS_MODULE, ++ .name = "hostfs", ++ .get_sb = hostfs_read_sb, ++ .kill_sb = kill_anon_super, ++ .fs_flags = 0, ++}; ++ ++static int __init init_hostfs(void) ++{ ++ return(register_filesystem(&hostfs_type)); ++} ++ ++static void __exit exit_hostfs(void) ++{ ++ unregister_filesystem(&hostfs_type); ++} ++ ++module_init(init_hostfs) ++module_exit(exit_hostfs) ++MODULE_LICENSE("GPL"); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -puN /dev/null fs/hostfs/hostfs_user.c +--- /dev/null Sat Mar 23 22:46:34 2002 ++++ limbo-god/fs/hostfs/hostfs_user.c Tue Oct 21 16:42:38 2003 +@@ -0,0 +1,361 @@ ++/* ++ * Copyright (C) 2000 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <unistd.h> ++#include <stdio.h> ++#include <fcntl.h> ++#include <dirent.h> ++#include <errno.h> ++#include <utime.h> ++#include <string.h> ++#include <sys/stat.h> ++#include <sys/time.h> ++#include <sys/vfs.h> ++#include "hostfs.h" ++#include "kern_util.h" ++#include "user.h" ++ ++int stat_file(const char *path, unsigned long long *inode_out, int *mode_out, ++ int *nlink_out, int *uid_out, int *gid_out, ++ unsigned long long *size_out, struct timespec *atime_out, ++ struct timespec *mtime_out, struct timespec *ctime_out, ++ int *blksize_out, unsigned long long *blocks_out) ++{ ++ struct stat64 buf; ++ ++ if(lstat64(path, &buf) < 0) ++ return(-errno); ++ ++ /* See the Makefile for why STAT64_INO_FIELD is passed in ++ * by the build ++ */ ++ if(inode_out != NULL) *inode_out = buf.STAT64_INO_FIELD; ++ if(mode_out != NULL) *mode_out = buf.st_mode; ++ if(nlink_out != NULL) *nlink_out = buf.st_nlink; ++ if(uid_out != NULL) *uid_out = buf.st_uid; ++ if(gid_out != NULL) *gid_out = buf.st_gid; ++ if(size_out != NULL) *size_out = buf.st_size; ++ if(atime_out != NULL) { ++ atime_out->tv_sec = buf.st_atime; ++ atime_out->tv_nsec = 0; ++ } ++ if(mtime_out != NULL) { ++ mtime_out->tv_sec = buf.st_mtime; ++ mtime_out->tv_nsec = 0; ++ } ++ if(ctime_out != NULL) { ++ ctime_out->tv_sec = buf.st_ctime; ++ ctime_out->tv_nsec = 0; ++ } ++ if(blksize_out != NULL) *blksize_out = buf.st_blksize; ++ if(blocks_out != NULL) *blocks_out = buf.st_blocks; ++ return(0); ++} ++ ++int file_type(const char *path, int *rdev) ++{ ++ struct stat64 buf; ++ ++ if(lstat64(path, &buf) < 0) ++ return(-errno); ++ if(rdev != NULL) ++ *rdev = buf.st_rdev; ++ ++ if(S_ISDIR(buf.st_mode)) return(OS_TYPE_DIR); ++ else if(S_ISLNK(buf.st_mode)) return(OS_TYPE_SYMLINK); ++ else if(S_ISCHR(buf.st_mode)) return(OS_TYPE_CHARDEV); ++ else if(S_ISBLK(buf.st_mode)) return(OS_TYPE_BLOCKDEV); ++ else if(S_ISFIFO(buf.st_mode))return(OS_TYPE_FIFO); ++ else if(S_ISSOCK(buf.st_mode))return(OS_TYPE_SOCK); ++ else return(OS_TYPE_FILE); ++} ++ ++int access_file(char *path, int r, int w, int x) ++{ ++ int mode = 0; ++ ++ if(r) mode = R_OK; ++ if(w) mode |= W_OK; ++ if(x) mode |= X_OK; ++ if(access(path, mode) != 0) return(-errno); ++ else return(0); ++} ++ ++int open_file(char *path, int r, int w, int append) ++{ ++ int mode = 0, fd; ++ ++ if(r && !w) ++ mode = 
O_RDONLY; ++ else if(!r && w) ++ mode = O_WRONLY; ++ else if(r && w) ++ mode = O_RDWR; ++ else panic("Impossible mode in open_file"); ++ ++ if(append) ++ mode |= O_APPEND; ++ fd = open64(path, mode); ++ if(fd < 0) return(-errno); ++ else return(fd); ++} ++ ++void *open_dir(char *path, int *err_out) ++{ ++ DIR *dir; ++ ++ dir = opendir(path); ++ *err_out = errno; ++ if(dir == NULL) return(NULL); ++ return(dir); ++} ++ ++char *read_dir(void *stream, unsigned long long *pos, ++ unsigned long long *ino_out, int *len_out) ++{ ++ DIR *dir = stream; ++ struct dirent *ent; ++ ++ seekdir(dir, *pos); ++ ent = readdir(dir); ++ if(ent == NULL) return(NULL); ++ *len_out = strlen(ent->d_name); ++ *ino_out = ent->d_ino; ++ *pos = telldir(dir); ++ return(ent->d_name); ++} ++ ++int read_file(int fd, unsigned long long *offset, char *buf, int len) ++{ ++ int n; ++ ++ n = pread64(fd, buf, len, *offset); ++ if(n < 0) return(-errno); ++ *offset += n; ++ return(n); ++} ++ ++int write_file(int fd, unsigned long long *offset, const char *buf, int len) ++{ ++ int n; ++ ++ n = pwrite64(fd, buf, len, *offset); ++ if(n < 0) return(-errno); ++ *offset += n; ++ return(n); ++} ++ ++int lseek_file(int fd, long long offset, int whence) ++{ ++ int ret; ++ ++ ret = lseek64(fd, offset, whence); ++ if(ret < 0) return(-errno); ++ return(0); ++} ++ ++void close_file(void *stream) ++{ ++ close(*((int *) stream)); ++} ++ ++void close_dir(void *stream) ++{ ++ closedir(stream); ++} ++ ++int file_create(char *name, int ur, int uw, int ux, int gr, ++ int gw, int gx, int or, int ow, int ox) ++{ ++ int mode, fd; ++ ++ mode = 0; ++ mode |= ur ? S_IRUSR : 0; ++ mode |= uw ? S_IWUSR : 0; ++ mode |= ux ? S_IXUSR : 0; ++ mode |= gr ? S_IRGRP : 0; ++ mode |= gw ? S_IWGRP : 0; ++ mode |= gx ? S_IXGRP : 0; ++ mode |= or ? S_IROTH : 0; ++ mode |= ow ? S_IWOTH : 0; ++ mode |= ox ? 
S_IXOTH : 0; ++ fd = open64(name, O_CREAT | O_RDWR, mode); ++ if(fd < 0) ++ return(-errno); ++ return(fd); ++} ++ ++int set_attr(const char *file, struct hostfs_iattr *attrs) ++{ ++ struct utimbuf buf; ++ int err, ma; ++ ++ if(attrs->ia_valid & HOSTFS_ATTR_MODE){ ++ if(chmod(file, attrs->ia_mode) != 0) return(-errno); ++ } ++ if(attrs->ia_valid & HOSTFS_ATTR_UID){ ++ if(chown(file, attrs->ia_uid, -1)) return(-errno); ++ } ++ if(attrs->ia_valid & HOSTFS_ATTR_GID){ ++ if(chown(file, -1, attrs->ia_gid)) return(-errno); ++ } ++ if(attrs->ia_valid & HOSTFS_ATTR_SIZE){ ++ if(truncate(file, attrs->ia_size)) return(-errno); ++ } ++ ma = HOSTFS_ATTR_ATIME_SET | HOSTFS_ATTR_MTIME_SET; ++ if((attrs->ia_valid & ma) == ma){ ++ buf.actime = attrs->ia_atime.tv_sec; ++ buf.modtime = attrs->ia_mtime.tv_sec; ++ if(utime(file, &buf) != 0) return(-errno); ++ } ++ else { ++ struct timespec ts; ++ ++ if(attrs->ia_valid & HOSTFS_ATTR_ATIME_SET){ ++ err = stat_file(file, NULL, NULL, NULL, NULL, NULL, ++ NULL, NULL, &ts, NULL, NULL, NULL); ++ if(err != 0) ++ return(err); ++ buf.actime = attrs->ia_atime.tv_sec; ++ buf.modtime = ts.tv_sec; ++ if(utime(file, &buf) != 0) ++ return(-errno); ++ } ++ if(attrs->ia_valid & HOSTFS_ATTR_MTIME_SET){ ++ err = stat_file(file, NULL, NULL, NULL, NULL, NULL, ++ NULL, &ts, NULL, NULL, NULL, NULL); ++ if(err != 0) ++ return(err); ++ buf.actime = ts.tv_sec; ++ buf.modtime = attrs->ia_mtime.tv_sec; ++ if(utime(file, &buf) != 0) ++ return(-errno); ++ } ++ } ++ if(attrs->ia_valid & HOSTFS_ATTR_CTIME) ; ++ if(attrs->ia_valid & (HOSTFS_ATTR_ATIME | HOSTFS_ATTR_MTIME)){ ++ err = stat_file(file, NULL, NULL, NULL, NULL, NULL, NULL, ++ &attrs->ia_atime, &attrs->ia_mtime, NULL, ++ NULL, NULL); ++ if(err != 0) return(err); ++ } ++ return(0); ++} ++ ++int make_symlink(const char *from, const char *to) ++{ ++ int err; ++ ++ err = symlink(to, from); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int unlink_file(const char *file) ++{ ++ int err; ++ ++ err = unlink(file); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_mkdir(const char *file, int mode) ++{ ++ int err; ++ ++ err = mkdir(file, mode); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_rmdir(const char *file) ++{ ++ int err; ++ ++ err = rmdir(file); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_mknod(const char *file, int mode, int dev) ++{ ++ int err; ++ ++ err = mknod(file, mode, dev); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int link_file(const char *to, const char *from) ++{ ++ int err; ++ ++ err = link(to, from); ++ if(err) return(-errno); ++ return(0); ++} ++ ++int do_readlink(char *file, char *buf, int size) ++{ ++ int n; ++ ++ n = readlink(file, buf, size); ++ if(n < 0) ++ return(-errno); ++ if(n < size) ++ buf[n] = '\0'; ++ return(n); ++} ++ ++int rename_file(char *from, char *to) ++{ ++ int err; ++ ++ err = rename(from, to); ++ if(err < 0) return(-errno); ++ return(0); ++} ++ ++int do_statfs(char *root, long *bsize_out, long long *blocks_out, ++ long long *bfree_out, long long *bavail_out, ++ long long *files_out, long long *ffree_out, ++ void *fsid_out, int fsid_size, long *namelen_out, ++ long *spare_out) ++{ ++ struct statfs64 buf; ++ int err; ++ ++ err = statfs64(root, &buf); ++ if(err < 0) return(-errno); ++ *bsize_out = buf.f_bsize; ++ *blocks_out = buf.f_blocks; ++ *bfree_out = buf.f_bfree; ++ *bavail_out = buf.f_bavail; ++ *files_out = buf.f_files; ++ *ffree_out = buf.f_ffree; ++ memcpy(fsid_out, &buf.f_fsid, ++ sizeof(buf.f_fsid) > fsid_size ? 
fsid_size : ++ sizeof(buf.f_fsid)); ++ *namelen_out = buf.f_namelen; ++ spare_out[0] = buf.f_spare[0]; ++ spare_out[1] = buf.f_spare[1]; ++ spare_out[2] = buf.f_spare[2]; ++ spare_out[3] = buf.f_spare[3]; ++ spare_out[4] = buf.f_spare[4]; ++ spare_out[5] = buf.f_spare[5]; ++ return(0); ++} ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ +diff -puN /dev/null fs/hppfs/Makefile +--- /dev/null Sat Mar 23 22:46:34 2002 ++++ limbo-god/fs/hppfs/Makefile Tue Oct 21 16:42:38 2003 +@@ -0,0 +1,19 @@ ++# ++# Copyright (C) 2002, 2003 Jeff Dike (jdike@karaya.com) ++# Licensed under the GPL ++# ++ ++hppfs-objs := hppfs_kern.o ++ ++obj-y = ++obj-$(CONFIG_HPPFS) += hppfs.o ++ ++clean: ++ ++modules: ++ ++fastdep: ++ ++dep: ++ ++archmrproper: clean +diff -puN /dev/null fs/hppfs/hppfs_kern.c +--- /dev/null Sat Mar 23 22:46:34 2002 ++++ limbo-god/fs/hppfs/hppfs_kern.c Tue Oct 21 16:42:38 2003 +@@ -0,0 +1,811 @@ ++/* ++ * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) ++ * Licensed under the GPL ++ */ ++ ++#include <linux/fs.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/slab.h> ++#include <linux/list.h> ++#include <linux/kernel.h> ++#include <linux/ctype.h> ++#include <linux/dcache.h> ++#include <linux/statfs.h> ++#include <asm/uaccess.h> ++#include <asm/fcntl.h> ++#include "os.h" ++ ++static int init_inode(struct inode *inode, struct dentry *dentry); ++ ++struct hppfs_data { ++ struct list_head list; ++ char contents[PAGE_SIZE - sizeof(struct list_head)]; ++}; ++ ++struct hppfs_private { ++ struct file proc_file; ++ int host_fd; ++ loff_t len; ++ struct hppfs_data *contents; ++}; ++ ++struct hppfs_inode_info { ++ struct dentry *proc_dentry; ++ struct inode vfs_inode; ++}; ++ ++static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) ++{ ++ return(list_entry(inode, struct hppfs_inode_info, vfs_inode)); ++} ++ ++#define HPPFS_SUPER_MAGIC 0xb00000ee ++ ++static struct super_operations hppfs_sbops; ++ ++static int is_pid(struct dentry *dentry) ++{ ++ struct super_block *sb; ++ int i; ++ ++ sb = dentry->d_sb; ++ if((sb->s_op != &hppfs_sbops) || (dentry->d_parent != sb->s_root)) ++ return(0); ++ ++ for(i = 0; i < dentry->d_name.len; i++){ ++ if(!isdigit(dentry->d_name.name[i])) ++ return(0); ++ } ++ return(1); ++} ++ ++static char *dentry_name(struct dentry *dentry, int extra) ++{ ++ struct dentry *parent; ++ char *root, *name; ++ const char *seg_name; ++ int len, seg_len; ++ ++ len = 0; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ if(is_pid(parent)) ++ len += strlen("pid") + 1; ++ else len += parent->d_name.len + 1; ++ parent = parent->d_parent; ++ } ++ ++ root = "proc"; ++ len += strlen(root); ++ name = kmalloc(len + extra + 1, GFP_KERNEL); ++ if(name == NULL) return(NULL); ++ ++ name[len] = '\0'; ++ parent = dentry; ++ while(parent->d_parent != parent){ ++ if(is_pid(parent)){ ++ seg_name = "pid"; ++ seg_len = strlen("pid"); ++ } ++ else { ++ seg_name = parent->d_name.name; ++ seg_len = parent->d_name.len; ++ } ++ ++ len -= seg_len + 1; ++ name[len] = '/'; ++ strncpy(&name[len + 1], seg_name, seg_len); ++ parent = parent->d_parent; ++ } ++ strncpy(name, root, strlen(root)); ++ return(name); ++} ++ ++struct dentry_operations 
hppfs_dentry_ops = {
++};
++
++static int file_removed(struct dentry *dentry, const char *file)
++{
++ char *host_file;
++ int extra, fd;
++
++ extra = 0;
++ if(file != NULL) extra += strlen(file) + 1;
++
++ host_file = dentry_name(dentry, extra + strlen("/remove"));
++ if(host_file == NULL){
++ printk("file_removed : allocation failed\n");
++ return(-ENOMEM);
++ }
++
++ if(file != NULL){
++ strcat(host_file, "/");
++ strcat(host_file, file);
++ }
++ strcat(host_file, "/remove");
++
++ fd = os_open_file(host_file, of_read(OPENFLAGS()), 0);
++ kfree(host_file);
++ if(fd > 0){
++ os_close_file(fd);
++ return(1);
++ }
++ return(0);
++}
++
++static void hppfs_read_inode(struct inode *ino)
++{
++ struct inode *proc_ino;
++
++ if(HPPFS_I(ino)->proc_dentry == NULL)
++ return;
++
++ proc_ino = HPPFS_I(ino)->proc_dentry->d_inode;
++ ino->i_uid = proc_ino->i_uid;
++ ino->i_gid = proc_ino->i_gid;
++ ino->i_atime = proc_ino->i_atime;
++ ino->i_mtime = proc_ino->i_mtime;
++ ino->i_ctime = proc_ino->i_ctime;
++ ino->i_ino = proc_ino->i_ino;
++ ino->i_mode = proc_ino->i_mode;
++ ino->i_nlink = proc_ino->i_nlink;
++ ino->i_size = proc_ino->i_size;
++ ino->i_blksize = proc_ino->i_blksize;
++ ino->i_blocks = proc_ino->i_blocks;
++}
++
++static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
++ struct nameidata *nd)
++{
++ struct dentry *proc_dentry, *new, *parent;
++ struct inode *inode;
++ int err, deleted;
++
++ deleted = file_removed(dentry, NULL);
++ if(deleted < 0)
++ return(ERR_PTR(deleted));
++ else if(deleted)
++ return(ERR_PTR(-ENOENT));
++
++ err = -ENOMEM;
++ parent = HPPFS_I(ino)->proc_dentry;
++ down(&parent->d_inode->i_sem);
++ proc_dentry = d_lookup(parent, &dentry->d_name);
++ if(proc_dentry == NULL){
++ proc_dentry = d_alloc(parent, &dentry->d_name);
++ if(proc_dentry == NULL){
++ up(&parent->d_inode->i_sem);
++ goto out;
++ }
++ new = (*parent->d_inode->i_op->lookup)(parent->d_inode,
++ proc_dentry, NULL);
++ if(new){
++ dput(proc_dentry);
++ proc_dentry = new;
++ }
++ }
++ up(&parent->d_inode->i_sem);
++
++ if(IS_ERR(proc_dentry))
++ return(proc_dentry);
++
++ inode = iget(ino->i_sb, 0);
++ if(inode == NULL)
++ goto out_dput;
++
++ err = init_inode(inode, proc_dentry);
++ if(err)
++ goto out_put;
++
++ hppfs_read_inode(inode);
++
++ d_add(dentry, inode);
++ dentry->d_op = &hppfs_dentry_ops;
++ return(NULL);
++
++ out_put:
++ iput(inode);
++ out_dput:
++ dput(proc_dentry);
++ out:
++ return(ERR_PTR(err));
++}
++
++static struct inode_operations hppfs_file_iops = {
++};
++
++static ssize_t read_proc(struct file *file, char *buf, ssize_t count,
++ loff_t *ppos, int is_user)
++{
++ ssize_t (*read)(struct file *, char *, size_t, loff_t *);
++ ssize_t n;
++
++ read = file->f_dentry->d_inode->i_fop->read;
++
++ if(!is_user)
++ set_fs(KERNEL_DS);
++
++ n = (*read)(file, buf, count, &file->f_pos);
++
++ if(!is_user)
++ set_fs(USER_DS);
++
++ if(ppos) *ppos = file->f_pos;
++ return(n);
++}
++
++static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count)
++{
++ ssize_t n;
++ int cur, err;
++ char *new_buf;
++
++ n = -ENOMEM;
++ new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
++ if(new_buf == NULL){
++ printk("hppfs_read_file : kmalloc failed\n");
++ goto out;
++ }
++ n = 0;
++ while(count > 0){
++ cur = min_t(ssize_t, count, PAGE_SIZE);
++ err = os_read_file(fd, new_buf, cur);
++ if(err < 0){
++ printk("hppfs_read : read failed, errno = %d\n",
++ err);
++ n = err;
++ goto out_free;
++ }
++ else if(err == 0)
++ break;
++
++ if(copy_to_user(buf, new_buf, err)){
++ n = -EFAULT;
++ goto out_free; ++ } ++ n += err; ++ count -= err; ++ } ++ out_free: ++ kfree(new_buf); ++ out: ++ return(n); ++} ++ ++static ssize_t hppfs_read(struct file *file, char *buf, size_t count, ++ loff_t *ppos) ++{ ++ struct hppfs_private *hppfs = file->private_data; ++ struct hppfs_data *data; ++ loff_t off; ++ int err; ++ ++ if(hppfs->contents != NULL){ ++ if(*ppos >= hppfs->len) return(0); ++ ++ data = hppfs->contents; ++ off = *ppos; ++ while(off >= sizeof(data->contents)){ ++ data = list_entry(data->list.next, struct hppfs_data, ++ list); ++ off -= sizeof(data->contents); ++ } ++ ++ if(off + count > hppfs->len) ++ count = hppfs->len - off; ++ copy_to_user(buf, &data->contents[off], count); ++ *ppos += count; ++ } ++ else if(hppfs->host_fd != -1){ ++ err = os_seek_file(hppfs->host_fd, *ppos); ++ if(err){ ++ printk("hppfs_read : seek failed, errno = %d\n", err); ++ return(err); ++ } ++ count = hppfs_read_file(hppfs->host_fd, buf, count); ++ if(count > 0) ++ *ppos += count; ++ } ++ else count = read_proc(&hppfs->proc_file, buf, count, ppos, 1); ++ ++ return(count); ++} ++ ++static ssize_t hppfs_write(struct file *file, const char *buf, size_t len, ++ loff_t *ppos) ++{ ++ struct hppfs_private *data = file->private_data; ++ struct file *proc_file = &data->proc_file; ++ ssize_t (*write)(struct file *, const char *, size_t, loff_t *); ++ int err; ++ ++ write = proc_file->f_dentry->d_inode->i_fop->write; ++ ++ proc_file->f_pos = file->f_pos; ++ err = (*write)(proc_file, buf, len, &proc_file->f_pos); ++ file->f_pos = proc_file->f_pos; ++ ++ return(err); ++} ++ ++static int open_host_sock(char *host_file, int *filter_out) ++{ ++ char *end; ++ int fd; ++ ++ end = &host_file[strlen(host_file)]; ++ strcpy(end, "/rw"); ++ *filter_out = 1; ++ fd = os_connect_socket(host_file); ++ if(fd > 0) ++ return(fd); ++ ++ strcpy(end, "/r"); ++ *filter_out = 0; ++ fd = os_connect_socket(host_file); ++ return(fd); ++} ++ ++static void free_contents(struct hppfs_data *head) ++{ ++ struct hppfs_data *data; ++ struct list_head *ele, *next; ++ ++ if(head == NULL) return; ++ ++ list_for_each_safe(ele, next, &head->list){ ++ data = list_entry(ele, struct hppfs_data, list); ++ kfree(data); ++ } ++ kfree(head); ++} ++ ++static struct hppfs_data *hppfs_get_data(int fd, int filter, ++ struct file *proc_file, ++ struct file *hppfs_file, ++ loff_t *size_out) ++{ ++ struct hppfs_data *data, *new, *head; ++ int n, err; ++ ++ err = -ENOMEM; ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ if(data == NULL){ ++ printk("hppfs_get_data : head allocation failed\n"); ++ goto failed; ++ } ++ ++ INIT_LIST_HEAD(&data->list); ++ ++ head = data; ++ *size_out = 0; ++ ++ if(filter){ ++ while((n = read_proc(proc_file, data->contents, ++ sizeof(data->contents), NULL, 0)) > 0) ++ os_write_file(fd, data->contents, n); ++ err = os_shutdown_socket(fd, 0, 1); ++ if(err){ ++ printk("hppfs_get_data : failed to shut down " ++ "socket\n"); ++ goto failed_free; ++ } ++ } ++ while(1){ ++ n = os_read_file(fd, data->contents, sizeof(data->contents)); ++ if(n < 0){ ++ err = n; ++ printk("hppfs_get_data : read failed, errno = %d\n", ++ err); ++ goto failed_free; ++ } ++ else if(n == 0) ++ break; ++ ++ *size_out += n; ++ ++ if(n < sizeof(data->contents)) ++ break; ++ ++ new = kmalloc(sizeof(*data), GFP_KERNEL); ++ if(new == 0){ ++ printk("hppfs_get_data : data allocation failed\n"); ++ err = -ENOMEM; ++ goto failed_free; ++ } ++ ++ INIT_LIST_HEAD(&new->list); ++ list_add(&new->list, &data->list); ++ data = new; ++ } ++ return(head); ++ ++ failed_free: ++ 
free_contents(head); ++ failed: ++ return(ERR_PTR(err)); ++} ++ ++static struct hppfs_private *hppfs_data(void) ++{ ++ struct hppfs_private *data; ++ ++ data = kmalloc(sizeof(*data), GFP_KERNEL); ++ if(data == NULL) ++ return(data); ++ ++ *data = ((struct hppfs_private ) { .host_fd = -1, ++ .len = -1, ++ .contents = NULL } ); ++ return(data); ++} ++ ++static int file_mode(int fmode) ++{ ++ if(fmode == (FMODE_READ | FMODE_WRITE)) ++ return(O_RDWR); ++ if(fmode == FMODE_READ) ++ return(O_RDONLY); ++ if(fmode == FMODE_WRITE) ++ return(O_WRONLY); ++ return(0); ++} ++ ++static int hppfs_open(struct inode *inode, struct file *file) ++{ ++ struct hppfs_private *data; ++ struct dentry *proc_dentry; ++ char *host_file; ++ int err, fd, type, filter; ++ ++ err = -ENOMEM; ++ data = hppfs_data(); ++ if(data == NULL) ++ goto out; ++ ++ host_file = dentry_name(file->f_dentry, strlen("/rw")); ++ if(host_file == NULL) ++ goto out_free2; ++ ++ proc_dentry = HPPFS_I(inode)->proc_dentry; ++ ++ /* XXX This isn't closed anywhere */ ++ err = open_private_file(&data->proc_file, proc_dentry, ++ file_mode(file->f_mode)); ++ if(err) ++ goto out_free1; ++ ++ type = os_file_type(host_file); ++ if(type == OS_TYPE_FILE){ ++ fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); ++ if(fd >= 0) ++ data->host_fd = fd; ++ else printk("hppfs_open : failed to open '%s', errno = %d\n", ++ host_file, -fd); ++ ++ data->contents = NULL; ++ } ++ else if(type == OS_TYPE_DIR){ ++ fd = open_host_sock(host_file, &filter); ++ if(fd > 0){ ++ data->contents = hppfs_get_data(fd, filter, ++ &data->proc_file, ++ file, &data->len); ++ if(!IS_ERR(data->contents)) ++ data->host_fd = fd; ++ } ++ else printk("hppfs_open : failed to open a socket in " ++ "'%s', errno = %d\n", host_file, -fd); ++ } ++ kfree(host_file); ++ ++ file->private_data = data; ++ return(0); ++ ++ out_free1: ++ kfree(host_file); ++ out_free2: ++ free_contents(data->contents); ++ kfree(data); ++ out: ++ return(err); ++} ++ ++static int hppfs_dir_open(struct inode *inode, struct file *file) ++{ ++ struct hppfs_private *data; ++ struct dentry *proc_dentry; ++ int err; ++ ++ err = -ENOMEM; ++ data = hppfs_data(); ++ if(data == NULL) ++ goto out; ++ ++ proc_dentry = HPPFS_I(inode)->proc_dentry; ++ err = open_private_file(&data->proc_file, proc_dentry, ++ file_mode(file->f_mode)); ++ if(err) ++ goto out_free; ++ ++ file->private_data = data; ++ return(0); ++ ++ out_free: ++ kfree(data); ++ out: ++ return(err); ++} ++ ++static loff_t hppfs_llseek(struct file *file, loff_t off, int where) ++{ ++ struct hppfs_private *data = file->private_data; ++ struct file *proc_file = &data->proc_file; ++ loff_t (*llseek)(struct file *, loff_t, int); ++ loff_t ret; ++ ++ llseek = proc_file->f_dentry->d_inode->i_fop->llseek; ++ if(llseek != NULL){ ++ ret = (*llseek)(proc_file, off, where); ++ if(ret < 0) ++ return(ret); ++ } ++ ++ return(default_llseek(file, off, where)); ++} ++ ++static struct file_operations hppfs_file_fops = { ++ .owner = NULL, ++ .llseek = hppfs_llseek, ++ .read = hppfs_read, ++ .write = hppfs_write, ++ .open = hppfs_open, ++}; ++ ++struct hppfs_dirent { ++ void *vfs_dirent; ++ filldir_t filldir; ++ struct dentry *dentry; ++}; ++ ++static int hppfs_filldir(void *d, const char *name, int size, ++ loff_t offset, ino_t inode, unsigned int type) ++{ ++ struct hppfs_dirent *dirent = d; ++ ++ if(file_removed(dirent->dentry, name)) ++ return(0); ++ ++ return((*dirent->filldir)(dirent->vfs_dirent, name, size, offset, ++ inode, type)); ++} ++ ++static int hppfs_readdir(struct file 
*file, void *ent, filldir_t filldir) ++{ ++ struct hppfs_private *data = file->private_data; ++ struct file *proc_file = &data->proc_file; ++ int (*readdir)(struct file *, void *, filldir_t); ++ struct hppfs_dirent dirent = ((struct hppfs_dirent) ++ { .vfs_dirent = ent, ++ .filldir = filldir, ++ .dentry = file->f_dentry } ); ++ int err; ++ ++ readdir = proc_file->f_dentry->d_inode->i_fop->readdir; ++ ++ proc_file->f_pos = file->f_pos; ++ err = (*readdir)(proc_file, &dirent, hppfs_filldir); ++ file->f_pos = proc_file->f_pos; ++ ++ return(err); ++} ++ ++static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) ++{ ++ return(0); ++} ++ ++static struct file_operations hppfs_dir_fops = { ++ .owner = NULL, ++ .readdir = hppfs_readdir, ++ .open = hppfs_dir_open, ++ .fsync = hppfs_fsync, ++}; ++ ++static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf) ++{ ++ sf->f_blocks = 0; ++ sf->f_bfree = 0; ++ sf->f_bavail = 0; ++ sf->f_files = 0; ++ sf->f_ffree = 0; ++ sf->f_type = HPPFS_SUPER_MAGIC; ++ return(0); ++} ++ ++static struct inode *hppfs_alloc_inode(struct super_block *sb) ++{ ++ struct hppfs_inode_info *hi; ++ ++ hi = kmalloc(sizeof(*hi), GFP_KERNEL); ++ if(hi == NULL) ++ return(NULL); ++ ++ *hi = ((struct hppfs_inode_info) { .proc_dentry = NULL }); ++ inode_init_once(&hi->vfs_inode); ++ return(&hi->vfs_inode); ++} ++ ++void hppfs_delete_inode(struct inode *ino) ++{ ++ clear_inode(ino); ++} ++ ++static void hppfs_destroy_inode(struct inode *inode) ++{ ++ kfree(HPPFS_I(inode)); ++} ++ ++static struct super_operations hppfs_sbops = { ++ .alloc_inode = hppfs_alloc_inode, ++ .destroy_inode = hppfs_destroy_inode, ++ .read_inode = hppfs_read_inode, ++ .delete_inode = hppfs_delete_inode, ++ .statfs = hppfs_statfs, ++}; ++ ++static int hppfs_readlink(struct dentry *dentry, char *buffer, int buflen) ++{ ++ struct file proc_file; ++ struct dentry *proc_dentry; ++ int (*readlink)(struct dentry *, char *, int); ++ int err, n; ++ ++ proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; ++ err = open_private_file(&proc_file, proc_dentry, O_RDONLY); ++ if(err) ++ return(err); ++ ++ readlink = proc_dentry->d_inode->i_op->readlink; ++ n = (*readlink)(proc_dentry, buffer, buflen); ++ ++ close_private_file(&proc_file); ++ ++ return(n); ++} ++ ++static int hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) ++{ ++ struct file proc_file; ++ struct dentry *proc_dentry; ++ int (*follow_link)(struct dentry *, struct nameidata *); ++ int err, n; ++ ++ proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; ++ err = open_private_file(&proc_file, proc_dentry, O_RDONLY); ++ if(err) ++ return(err); ++ ++ follow_link = proc_dentry->d_inode->i_op->follow_link; ++ n = (*follow_link)(proc_dentry, nd); ++ ++ close_private_file(&proc_file); ++ ++ return(n); ++} ++ ++static struct inode_operations hppfs_dir_iops = { ++ .lookup = hppfs_lookup, ++}; ++ ++static struct inode_operations hppfs_link_iops = { ++ .readlink = hppfs_readlink, ++ .follow_link = hppfs_follow_link, ++}; ++ ++static int init_inode(struct inode *inode, struct dentry *dentry) ++{ ++ if(S_ISDIR(dentry->d_inode->i_mode)){ ++ inode->i_op = &hppfs_dir_iops; ++ inode->i_fop = &hppfs_dir_fops; ++ } ++ else if(S_ISLNK(dentry->d_inode->i_mode)){ ++ inode->i_op = &hppfs_link_iops; ++ inode->i_fop = &hppfs_file_fops; ++ } ++ else { ++ inode->i_op = &hppfs_file_iops; ++ inode->i_fop = &hppfs_file_fops; ++ } ++ ++ HPPFS_I(inode)->proc_dentry = dentry; ++ ++ return(0); ++} ++ ++static int hppfs_fill_super(struct super_block *sb, void 
*d, int silent) ++{ ++ struct inode *root_inode; ++ struct file_system_type *procfs; ++ struct super_block *proc_sb; ++ int err; ++ ++ err = -ENOENT; ++ procfs = get_fs_type("proc"); ++ if(procfs == NULL) ++ goto out; ++ ++ if(list_empty(&procfs->fs_supers)) ++ goto out; ++ ++ proc_sb = list_entry(procfs->fs_supers.next, struct super_block, ++ s_instances); ++ ++ sb->s_blocksize = 1024; ++ sb->s_blocksize_bits = 10; ++ sb->s_magic = HPPFS_SUPER_MAGIC; ++ sb->s_op = &hppfs_sbops; ++ ++ root_inode = iget(sb, 0); ++ if(root_inode == NULL) ++ goto out; ++ ++ err = init_inode(root_inode, proc_sb->s_root); ++ if(err) ++ goto out_put; ++ ++ err = -ENOMEM; ++ sb->s_root = d_alloc_root(root_inode); ++ if(sb->s_root == NULL) ++ goto out_put; ++ ++ hppfs_read_inode(root_inode); ++ ++ return(0); ++ ++ out_put: ++ iput(root_inode); ++ out: ++ return(err); ++} ++ ++static struct super_block *hppfs_read_super(struct file_system_type *type, ++ int flags, const char *dev_name, ++ void *data) ++{ ++ return(get_sb_nodev(type, flags, data, hppfs_fill_super)); ++} ++ ++static struct file_system_type hppfs_type = { ++ .owner = THIS_MODULE, ++ .name = "hppfs", ++ .get_sb = hppfs_read_super, ++ .kill_sb = kill_anon_super, ++ .fs_flags = 0, ++}; ++ ++static int __init init_hppfs(void) ++{ ++ return(register_filesystem(&hppfs_type)); ++} ++ ++static void __exit exit_hppfs(void) ++{ ++ unregister_filesystem(&hppfs_type); ++} ++ ++module_init(init_hppfs) ++module_exit(exit_hppfs) ++MODULE_LICENSE("GPL"); ++ ++/* ++ * Overrides for Emacs so that we follow Linus's tabbing style. ++ * Emacs will notice this stuff at the end of the file and automatically ++ * adjust the settings for this buffer only. This must remain at the end ++ * of the file. ++ * --------------------------------------------------------------------------- ++ * Local variables: ++ * c-file-style: "linux" ++ * End: ++ */ + +_ diff -Nru a/patches/patches/uml-tty-init.diff.patch b/patches/patches/uml-tty-init.diff.patch --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/patches/uml-tty-init.diff.patch Fri Oct 31 14:10:55 2003 @@ -0,0 +1,45 @@ + +diff -puN drivers/char/tty_io.c~uml-tty-init.diff drivers/char/tty_io.c +--- limbo/drivers/char/tty_io.c~uml-tty-init.diff Tue Oct 21 16:42:52 2003 ++++ limbo-god/drivers/char/tty_io.c Tue Oct 21 16:42:52 2003 +@@ -2419,12 +2419,17 @@ static struct cdev ptmx_cdev; + static struct cdev vc0_cdev; + #endif + ++static int tty_initialized = 0; ++ + /* + * Ok, now we can initialize the rest of the tty devices and can count + * on memory allocations, interrupts etc.. 
+ */ +-static int __init tty_init(void) ++int __init tty_init(void) + { ++ if (tty_initialized) ++ return 0; ++ tty_initialized = 1; + strcpy(tty_cdev.kobj.name, "dev.tty"); + cdev_init(&tty_cdev, &tty_fops); + if (cdev_add(&tty_cdev, MKDEV(TTYAUX_MAJOR, 0), 1) || +diff -puN arch/um/drivers/stdio_console.c~uml-tty-init.diff arch/um/drivers/stdio_console.c +--- limbo/arch/um/drivers/stdio_console.c~uml-tty-init.diff Tue Oct 21 16:42:52 2003 ++++ limbo-god/arch/um/drivers/stdio_console.c Tue Oct 21 16:42:52 2003 +@@ -169,12 +169,16 @@ static struct tty_operations console_ops + .write_room = line_write_room, + }; + ++extern int tty_init(void); ++ + int stdio_init(void) + { + char *new_title; + + printk(KERN_INFO "Initializing stdio console driver\n"); + ++ tty_init(); ++ + console_driver = line_register_devfs(&console_lines, &driver, + &console_ops, vts, + sizeof(vts)/sizeof(vts[0])); + +_ diff -Nru a/patches/pc/all-sources.diff.pc b/patches/pc/all-sources.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/all-sources.diff.pc Fri Oct 31 14:10:55 2003 @@ -0,0 +1 @@ +Makefile diff -Nru a/patches/pc/do_mmap2-fix.diff.pc b/patches/pc/do_mmap2-fix.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/do_mmap2-fix.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1,4 @@ +arch/i386/kernel/sys_i386.c +include/linux/mm.h +mm/mmap.c +mm/mprotect.c diff -Nru a/patches/pc/export-generic_forget_inode.diff.pc b/patches/pc/export-generic_forget_inode.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/export-generic_forget_inode.diff.pc Fri Oct 31 14:10:55 2003 @@ -0,0 +1,2 @@ +fs/inode.c +include/linux/fs.h diff -Nru a/patches/pc/export-page_cache_readahead.diff.pc b/patches/pc/export-page_cache_readahead.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/export-page_cache_readahead.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1 @@ +mm/readahead.c diff -Nru a/patches/pc/export-remove_from_page_cache.diff.pc b/patches/pc/export-remove_from_page_cache.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/export-remove_from_page_cache.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1 @@ +mm/filemap.c diff -Nru a/patches/pc/fs_activation.diff.pc b/patches/pc/fs_activation.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/fs_activation.diff.pc Fri Oct 31 14:10:55 2003 @@ -0,0 +1,4 @@ +fs/jbd/transaction.c +include/linux/init_task.h +include/linux/jbd.h +include/linux/sched.h diff -Nru a/patches/pc/fsync_super.diff.pc b/patches/pc/fsync_super.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/fsync_super.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1 @@ +fs/buffer.c diff -Nru a/patches/pc/i386-sys_reiser4.diff.pc b/patches/pc/i386-sys_reiser4.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/i386-sys_reiser4.diff.pc Fri Oct 31 14:10:55 2003 @@ -0,0 +1,2 @@ +arch/i386/kernel/entry.S +include/asm-i386/unistd.h diff -Nru a/patches/pc/init_fixmap_vma.diff.pc b/patches/pc/init_fixmap_vma.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/init_fixmap_vma.diff.pc Fri Oct 31 14:10:55 2003 @@ -0,0 +1 @@ +mm/memory.c diff -Nru a/patches/pc/page-owner.diff.pc b/patches/pc/page-owner.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/page-owner.diff.pc Fri Oct 31 14:10:55 2003 @@ -0,0 +1,2 @@ +include/linux/mm.h +include/linux/page-flags.h diff -Nru a/patches/pc/reget-page-mapping.diff.pc b/patches/pc/reget-page-mapping.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/reget-page-mapping.diff.pc Fri Oct 31 
14:10:54 2003 @@ -0,0 +1,2 @@ +mm/vmscan.c +mm/truncate.c diff -Nru a/patches/pc/reiser4-fs-Kconfig.diff.pc b/patches/pc/reiser4-fs-Kconfig.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/reiser4-fs-Kconfig.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1 @@ +fs/Kconfig diff -Nru a/patches/pc/reiser4-fs-Makefile.diff.pc b/patches/pc/reiser4-fs-Makefile.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/reiser4-fs-Makefile.diff.pc Fri Oct 31 14:10:55 2003 @@ -0,0 +1 @@ +fs/Makefile diff -Nru a/patches/pc/sb_sync_inodes.diff.pc b/patches/pc/sb_sync_inodes.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/sb_sync_inodes.diff.pc Fri Oct 31 14:10:55 2003 @@ -0,0 +1,3 @@ +fs/fs-writeback.c +include/linux/writeback.h +include/linux/fs.h diff -Nru a/patches/pc/spinlock-owner.diff.pc b/patches/pc/spinlock-owner.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/spinlock-owner.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1,3 @@ +include/asm-i386/spinlock.h +kernel/sched.c +include/linux/spinlock.h diff -Nru a/patches/pc/static-inline-quotaops.diff.pc b/patches/pc/static-inline-quotaops.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/static-inline-quotaops.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1 @@ +include/linux/quotaops.h diff -Nru a/patches/pc/truncate_mapping_pages_range.diff.pc b/patches/pc/truncate_mapping_pages_range.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/truncate_mapping_pages_range.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1,2 @@ +include/linux/mm.h +mm/truncate.c diff -Nru a/patches/pc/uml-AUTOCONF_INCLUDED.diff.pc b/patches/pc/uml-AUTOCONF_INCLUDED.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/uml-AUTOCONF_INCLUDED.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1 @@ +arch/um/kernel/user_syms.c diff -Nru a/patches/pc/uml-asm-cpufeature-h.diff.pc b/patches/pc/uml-asm-cpufeature-h.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/uml-asm-cpufeature-h.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1 @@ +include/asm-um/cpufeature.h diff -Nru a/patches/pc/uml-asm-local-h.diff.pc b/patches/pc/uml-asm-local-h.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/uml-asm-local-h.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1 @@ +include/asm-um/local.h diff -Nru a/patches/pc/uml-asm-module-i386.h.diff.pc b/patches/pc/uml-asm-module-i386.h.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/uml-asm-module-i386.h.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1 @@ +include/asm-um/module-i386.h diff -Nru a/patches/pc/uml-asm-sections.diff.pc b/patches/pc/uml-asm-sections.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/uml-asm-sections.diff.pc Fri Oct 31 14:10:55 2003 @@ -0,0 +1 @@ +include/asm-um/sections.h diff -Nru a/patches/pc/uml-export-in-ksyms.c.diff.pc b/patches/pc/uml-export-in-ksyms.c.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/uml-export-in-ksyms.c.diff.pc Fri Oct 31 14:10:55 2003 @@ -0,0 +1,2 @@ +arch/um/kernel/ksyms.c +arch/um/kernel/time.c diff -Nru a/patches/pc/uml-kill-cow.diff.pc b/patches/pc/uml-kill-cow.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/uml-kill-cow.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1,3 @@ +arch/um/Kconfig_block +arch/um/defconfig +arch/um/drivers/ubd_user.c diff -Nru a/patches/pc/uml-kill-irq_kern.h.diff.pc b/patches/pc/uml-kill-irq_kern.h.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/uml-kill-irq_kern.h.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1,8 @@ 
+arch/um/drivers/line.c +arch/um/drivers/mconsole_kern.c +arch/um/drivers/net_kern.c +arch/um/drivers/port_kern.c +arch/um/drivers/ubd_kern.c +arch/um/drivers/xterm_kern.c +arch/um/kernel/irq.c +arch/um/kernel/sigio_kern.c diff -Nru a/patches/pc/uml-sched_clock.diff.pc b/patches/pc/uml-sched_clock.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/uml-sched_clock.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1 @@ +arch/um/kernel/time_kern.c diff -Nru a/patches/pc/uml-summa.diff.pc b/patches/pc/uml-summa.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/uml-summa.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1,104 @@ +arch/um/Kconfig +arch/um/Kconfig_block +arch/um/Kconfig_net +arch/um/Makefile +arch/um/Makefile-i386 +arch/um/Makefile-skas +arch/um/config.release +arch/um/defconfig +arch/um/drivers/Makefile +arch/um/drivers/chan_kern.c +arch/um/drivers/chan_user.c +arch/um/drivers/hostaudio_kern.c +arch/um/drivers/line.c +arch/um/drivers/mconsole_kern.c +arch/um/drivers/mconsole_user.c +arch/um/drivers/mmapper_kern.c +arch/um/drivers/net_kern.c +arch/um/drivers/port_kern.c +arch/um/drivers/ssl.c +arch/um/drivers/stdio_console.c +arch/um/drivers/ubd_kern.c +arch/um/drivers/ubd_user.c +arch/um/drivers/xterm.c +arch/um/drivers/xterm_kern.c +arch/um/dyn.lds.S +arch/um/include/kern_util.h +arch/um/include/line.h +arch/um/include/mconsole.h +arch/um/include/mem.h +arch/um/include/mem_user.h +arch/um/include/os.h +arch/um/include/sysdep-i386/checksum.h +arch/um/include/sysdep-i386/sigcontext.h +arch/um/include/time_user.h +arch/um/include/ubd_user.h +arch/um/include/user.h +arch/um/include/user_util.h +arch/um/kernel/Makefile +arch/um/kernel/config.c.in +arch/um/kernel/exec_kern.c +arch/um/kernel/init_task.c +arch/um/kernel/irq.c +arch/um/kernel/mem.c +arch/um/kernel/mem_user.c +arch/um/kernel/process.c +arch/um/kernel/process_kern.c +arch/um/kernel/ptrace.c +arch/um/kernel/sigio_kern.c +arch/um/kernel/signal_kern.c +arch/um/kernel/skas/Makefile +arch/um/kernel/skas/include/mode.h +arch/um/kernel/skas/include/uaccess.h +arch/um/kernel/skas/process.c +arch/um/kernel/skas/process_kern.c +arch/um/kernel/skas/util/mk_ptregs.c +arch/um/kernel/smp.c +arch/um/kernel/sys_call_table.c +arch/um/kernel/syscall_kern.c +arch/um/kernel/sysrq.c +arch/um/kernel/time.c +arch/um/kernel/time_kern.c +arch/um/kernel/trap_kern.c +arch/um/kernel/trap_user.c +arch/um/kernel/tt/exec_kern.c +arch/um/kernel/tt/include/uaccess.h +arch/um/kernel/tt/process_kern.c +arch/um/kernel/tt/ptproxy/proxy.c +arch/um/kernel/tt/tlb.c +arch/um/kernel/tt/tracer.c +arch/um/kernel/tt/uaccess_user.c +arch/um/kernel/tty_log.c +arch/um/kernel/uaccess_user.c +arch/um/kernel/um_arch.c +arch/um/kernel/umid.c +arch/um/kernel/user_util.c +arch/um/os-Linux/drivers/tuntap_user.c +arch/um/os-Linux/file.c +arch/um/sys-i386/Makefile +arch/um/sys-i386/bugs.c +arch/um/sys-i386/fault.c +arch/um/uml.lds.S +arch/um/util/mk_constants_kern.c +fs/Makefile +include/asm-um/archparam-i386.h +include/asm-um/bug.h +include/asm-um/common.lds.S +include/asm-um/current.h +include/asm-um/fixmap.h +include/asm-um/irq.h +include/asm-um/page.h +include/asm-um/pgtable.h +include/asm-um/processor-generic.h +include/asm-um/processor-i386.h +include/asm-um/smp.h +include/asm-um/system-generic.h +include/asm-um/thread_info.h +include/asm-um/timex.h +mm/Makefile +fs/hostfs/Makefile +fs/hostfs/hostfs.h +fs/hostfs/hostfs_kern.c +fs/hostfs/hostfs_user.c +fs/hppfs/Makefile +fs/hppfs/hppfs_kern.c diff -Nru a/patches/pc/uml-tty-init.diff.pc 
b/patches/pc/uml-tty-init.diff.pc --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/pc/uml-tty-init.diff.pc Fri Oct 31 14:10:54 2003 @@ -0,0 +1,2 @@ +drivers/char/tty_io.c +arch/um/drivers/stdio_console.c diff -Nru a/patches/series b/patches/series --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/series Fri Oct 31 14:10:54 2003 @@ -0,0 +1,28 @@ +all-sources.diff.patch +i386-sys_reiser4.diff.patch +do_mmap2-fix.diff.patch +uml-summa.diff.patch +reiser4-fs-Makefile.diff.patch +fsync_super.diff.patch +reiser4-fs-Kconfig.diff.patch +sb_sync_inodes.diff.patch +export-generic_forget_inode.diff.patch +spinlock-owner.diff.patch +truncate_mapping_pages_range.diff.patch +page-owner.diff.patch +init_fixmap_vma.diff.patch +export-remove_from_page_cache.diff.patch +export-page_cache_readahead.diff.patch +fs_activation.diff.patch +static-inline-quotaops.diff.patch +uml-asm-cpufeature-h.diff.patch +uml-asm-local-h.diff.patch +uml-kill-irq_kern.h.diff.patch +uml-export-in-ksyms.c.diff.patch +uml-sched_clock.diff.patch +uml-AUTOCONF_INCLUDED.diff.patch +uml-tty-init.diff.patch +uml-kill-cow.diff.patch +uml-asm-sections.diff.patch +uml-asm-module-i386.h.diff.patch +reget-page-mapping.diff.patch diff -Nru a/patches/txt/reget-page-mapping.diff.txt b/patches/txt/reget-page-mapping.diff.txt --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/patches/txt/reget-page-mapping.diff.txt Fri Oct 31 14:10:54 2003 @@ -0,0 +1,17 @@ +reiser4 keeps its meta-data pages in the page cache, attached to a special +"fake" inode. Meta-data blocks have "znodes" attached to them (reiser4 analog +of buffer_head) and initially don't have real disk block numbers +assigned. Later meta-data blocks can be "relocated" to decrease +fragmentation. As a result, their pages cannot be easily indexed by block +number. Instead reiser4 indexes pages of fake inode by some function of znode +address. This looks weird, but it works. The only problem is that there is a +race involving ->releasepage(): there is a window when znode has already been +freed by reiser4_releasepage(), but its page still exists (albeit locked). If +at this moment another znode is allocated at the same address as one just +destroyed, then some other thread can acquire a reference to lingering page +(because it is indexed by address of znode), and prevent shrink_list() from +freeing it. + +To avoid this, reiser4_releasepage() removes page from radix-tree +manually. This requires re-checking page->mapping after calling +try_to_release_page().
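
To make that last paragraph concrete, here is a minimal caller-side sketch, against the
2.6-era page-cache API, of what "re-checking page->mapping after calling
try_to_release_page()" looks like. It is an illustration, not code from this series: the
function name release_and_unmap_page() is hypothetical, the caller is assumed to hold the
page lock, and remove_from_page_cache() is assumed to be callable from here (presumably
why the series carries export-remove_from_page_cache.diff).

	#include <linux/mm.h>
	#include <linux/pagemap.h>

	/* Hypothetical reclaim step; the caller must hold the page lock. */
	static int release_and_unmap_page(struct page *page, int gfp_mask)
	{
		struct address_space *mapping;

		/* Let the filesystem drop its private state (znodes, buffers, ...). */
		if (PagePrivate(page) && !try_to_release_page(page, gfp_mask))
			return 0;	/* filesystem refused; leave the page alone */

		/*
		 * Re-read the mapping instead of trusting a value cached before
		 * the call: a ->releasepage() like reiser4's may already have
		 * removed the page from the radix tree, so page->mapping can be
		 * NULL here even though the page is still locked.
		 */
		mapping = page->mapping;
		if (mapping == NULL)
			return 1;	/* ->releasepage() detached it for us */

		remove_from_page_cache(page);	/* drop it from the radix tree */
		page_cache_release(page);	/* and the page cache's reference */
		return 1;
	}

Caching the mapping before try_to_release_page() and using it afterwards is exactly the
stale-pointer pattern this patch removes from shrink_list(): with pages indexed by znode
address rather than block number, the old value may describe a page that the filesystem
has already detached.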