diff -urN a/CREDITS b/CREDITS --- a/CREDITS +++ b/CREDITS @@ -52,6 +52,12 @@ S: Buenos Aires S: Argentina +N: Dan Aloni +E: da-x@colinux.org +D: Cooperative Linux +D: Various kernel patches +S: Israel + N: Tim Alpaerts E: tim_alpaerts@toyota-motor-europe.com D: 802.2 class II logical link control layer, diff -urN a/Makefile b/Makefile --- a/Makefile +++ b/Makefile @@ -319,6 +319,11 @@ AS = $(CROSS_COMPILE)as LD = $(CROSS_COMPILE)ld CC = $(CROSS_COMPILE)gcc +ifeq ($(GCCTRACE),Y) +CC = $(CROSS_COMPILE)$(COLINUX_ROOT)/bin/tracewrapper.py gcc +else +CC = $(CROSS_COMPILE)gcc +endif CPP = $(CC) -E AR = $(CROSS_COMPILE)ar NM = $(CROSS_COMPILE)nm diff -urN a/arch/i386/Kconfig b/arch/i386/Kconfig --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -205,6 +205,7 @@ config M586TSC bool "Pentium-Classic" + depends on !COOPERATIVE help Select this for a Pentium Classic processor with the RDTSC (Read Time Stamp Counter) instruction for benchmarking. @@ -543,6 +544,10 @@ If you have a system with several CPUs, you do not need to say Y here: the IO-APIC will be used automatically. 
+config X86_UP_COPIC + bool 'Cooperative PIC (COPIC) support' + depends on COOPERATIVE + config X86_LOCAL_APIC bool depends on !SMP && X86_UP_APIC @@ -555,7 +560,7 @@ config X86_TSC bool - depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2) && !X86_NUMAQ + depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2) && !X86_NUMAQ && !COOPERATIVE default y config X86_MCE @@ -882,6 +887,10 @@ source kernel/power/Kconfig +config COOPERATIVE + bool 'Cooperative Mode' + default y + source "drivers/acpi/Kconfig" menu "APM (Advanced Power Management) BIOS Support" diff -urN a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -17,6 +17,7 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_COOPERATIVE) += cooperative.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o diff -urN a/arch/i386/kernel/cooperative.c b/arch/i386/kernel/cooperative.c --- a/arch/i386/kernel/cooperative.c +++ b/arch/i386/kernel/cooperative.c @@ -0,0 +1,340 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +CO_TRACE_STOP; + + +/* + * The next asm code is the first Linux code that runs in the + * coLinux kernel context. It receives %ecx which contains the + * address of the passage page. The passage page code sets %ecx + * to this value in its context restore part. 
+ */ + +asm( + "" + ".section .text\n" + ".globl co_start\n" + "co_start:\n" + " call co_start_arch\n" + ".previous\n" + ""); + +static int co_passage_page_holding_count = 0; + +static void co_early_cpu_init(void) +{ + /* + * On the first switch to Linux we must set up a valid TR because + * the passage page code assumes such one exists. This is basically + * copied code from cpu_init(). + * + * P.S this is protected by CO_TRACE_STOP so that we don't + * have a monitor context switch. + */ + int cpu = smp_processor_id(); + struct tss_struct * t = &per_cpu(init_tss, cpu); + struct thread_struct *thread = ¤t->thread; + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + memcpy(&per_cpu(cpu_gdt_table, cpu), cpu_gdt_table, GDT_SIZE); + cpu_gdt_descr[cpu].size = GDT_SIZE - 1; + cpu_gdt_descr[cpu].address = (unsigned long)&per_cpu(cpu_gdt_table, cpu); + + /* + * Set up the per-thread TLS descriptor cache: + */ + memcpy(thread->tls_array, &per_cpu(cpu_gdt_table, cpu), GDT_ENTRY_TLS_ENTRIES * 8); + + __asm__ __volatile__("lgdt %0" : : "m" (cpu_gdt_descr[cpu])); + __asm__ __volatile__("lidt %0" : : "m" (idt_descr)); + + /* + * Delete NT + */ + __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl"); + + /* + * Set up and load the per-CPU TSS and LDT + */ + atomic_inc(&init_mm.mm_count); + current->active_mm = &init_mm; + enter_lazy_tlb(&init_mm, current); + + load_esp0(t, thread); + set_tss_desc(cpu,t); + per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_TSS].b &= 0xfffffdff; + + load_TR_desc(); + + load_LDT(&init_mm.context); + + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); + per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_DOUBLEFAULT_TSS].b &= 0xfffffdff; + + /* Clear %fs and %gs. 
*/ + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + + __asm__ __volatile__("movl %%cr4, %0" : "=r" (mmu_cr4_features)); +} + +asm( + "" + ".section .text\n" + ".globl co_arch_start_kernel\n" + "co_arch_start_kernel:\n" + " call co_startup_entry\n" + ".previous\n" + ""); + +void co_start_arch(void) +{ + co_early_cpu_init(); + co_start_kernel(); +} + +extern void ctrl_alt_del(void); + +void co_handle_jiffies(long count) +{ + unsigned long flags; + struct pt_regs regs; + + if (count > HZ) { + xtime.tv_sec += count / HZ; + count -= ((count / HZ) * HZ); + } + + while (count > 0) { + local_irq_save(flags); + regs.orig_eax = TIMER_IRQ; + do_IRQ(®s); + local_irq_restore(flags); + + count--; + } +} + +void co_handle_incoming_message(co_message_node_t *node_message) +{ + unsigned long flags; + struct pt_regs regs; + co_linux_message_t *message; + + message = (co_linux_message_t *)&node_message->msg.data; + + switch (message->device) { + case CO_DEVICE_POWER: { + co_linux_message_power_t *type = (co_linux_message_power_t *)message->data; + switch (type->type) { + case CO_LINUX_MESSAGE_POWER_ALT_CTRL_DEL: { + ctrl_alt_del(); + break; + } + } + co_free_message(node_message); + break; + } + + case CO_DEVICE_KEYBOARD: { + co_queue_incoming_message(node_message); + + local_irq_save(flags); + regs.orig_eax = KEYBOARD_IRQ; + do_IRQ(®s); + local_irq_restore(flags); + break; + } + + case CO_DEVICE_NETWORK: { + co_queue_incoming_message(node_message); + + local_irq_save(flags); + regs.orig_eax = NETWORK_IRQ; + do_IRQ(®s); + local_irq_restore(flags); + break; + } + + case CO_DEVICE_SERIAL: { + co_queue_incoming_message(node_message); + + local_irq_save(flags); + cocd_interrupt(); + local_irq_restore(flags); + break; + } + + default: + co_free_message(node_message); + break; + } +} + +void co_switch_wrapper_protected(void) +{ + kernel_fpu_begin(); + + /* + * We don't trust the passage page code to safely restore %gs and %fs. 
+ * + * This wrapper ensures that if %fs or %gs are invalid, the processes + * exits with a segmentation fault rather than bringing down the + * machine. + **/ + unsigned long fs = 0; + unsigned long gs = 0; + + asm volatile("movl %%fs,%0": "=m" (fs)); + asm volatile("movl %%gs,%0": "=m" (gs)); + + /* + * Nullify the registers so the passage page code restores to + * null segment values on return. + */ + asm volatile("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); + + /* And switch... */ + co_switch(); + + /* + * Safely restore the registers. + */ + loadsegment(fs, fs); + loadsegment(gs, gs); + + kernel_fpu_end(); +} + +void co_switch_wrapper(void) +{ + /* taken from irq.c: debugging check for stack overflow */ + long esp; + + __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + printk("co_switch_wrapper: stack overflow: %ld\n", esp - sizeof(struct thread_info)); + co_terminate(CO_TERMINATE_STACK_OVERFLOW); + } + + co_switch_wrapper_protected(); +} + +void co_passage_page_acquire(unsigned long *flags) +{ + local_irq_save(*flags); + co_passage_page_holding_count++; +} + +int co_passage_page_held(void) +{ + return co_passage_page_holding_count; +} + +void co_passage_page_release(unsigned long flags) +{ + co_passage_page_holding_count--; + local_irq_restore(flags); +} + +void co_debug(const char *fmt, ...) 
+{ +} + +#define MAX_TRACE_POINTS 1024 + +typedef struct { + unsigned char *code; + unsigned char original_byte; + int off; +} co_tracepoint_t; + +co_tracepoint_t tracepoints[MAX_TRACE_POINTS]; +static int active_tracepoints = 0; + +void co_kernel_breakpoint(struct pt_regs * regs) +{ + int i = 0; + unsigned char *code = (unsigned char *)regs->eip; + if (!code) + return; + + for (i=0; i < active_tracepoints; i++) { + if (tracepoints[i].code == code - 1) { + co_debug("TRACEPOINT: %x\n", code - 1); + break; + } + } + + if (i == active_tracepoints) { + /* Bad, we don't know this tracepoint */ + co_terminate(CO_TERMINATE_INVALID_OPERATION); + return; + } + + *tracepoints[i].code = tracepoints[i].original_byte; + regs->eflags |= (1 << 8); /* Enable TF */ + regs->eip = (unsigned long)(code - 1); + tracepoints[i].off = 1; +} + +void co_kernel_set_breakpoints(void) +{ + int i; + + for (i=0; i < active_tracepoints; i++) + if (tracepoints[i].code && tracepoints[i].off) { + *tracepoints[i].code = 0xcc; + tracepoints[i].off = 0; + } +} + +int co_kernel_debug(struct pt_regs *regs, long error_code, unsigned int condition) +{ + /* if not a single step trap */ + if (!(condition & DR_STEP)) + return 0; + + /* if userspace */ + if (regs->xcs & 3) + return 0; + + regs->eflags &= ~(1 << 8); /* Disable TF */ + + co_kernel_set_breakpoints(); + + return 1; +} + +void co_kernel_tracepoint_add(unsigned char *code) +{ + if (active_tracepoints >= MAX_TRACE_POINTS) + return; + + tracepoints[active_tracepoints].code = code; + tracepoints[active_tracepoints].original_byte = *code; + tracepoints[active_tracepoints].off = 0; + active_tracepoints++; + *code = 0xcc; +} + +co_arch_info_t co_arch_info = { + .kernel_cs = __KERNEL_CS, + .kernel_ds = __KERNEL_DS, +}; + +CO_TRACE_CONTINUE; diff -urN a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include 
#include @@ -570,11 +571,13 @@ /* Clear all 6 debug registers: */ + if (!cooperative_mode_enabled()) { #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); - CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); + CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); #undef CD + } /* * Force FPU initialization: diff -urN a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -158,7 +158,7 @@ ALIGN ret_from_exception: preempt_stop -ret_from_intr: +ENTRY(ret_from_intr) GET_THREAD_INFO(%ebp) movl EFLAGS(%esp), %eax # mix EFLAGS and CS movb CS(%esp), %al diff -urN a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -238,6 +238,7 @@ rep movsl 1: +ENTRY(co_startup_entry) checkCPUtype: movl $-1,X86_CPUID # -1 for no CPUID initially @@ -425,7 +426,7 @@ .data ENTRY(stack_start) - .long init_thread_union+THREAD_SIZE + .long init_thread_union+THREAD_SIZE-100 .long __BOOT_DS ready: .byte 0 diff -urN a/arch/i386/kernel/i387.c b/arch/i386/kernel/i387.c --- a/arch/i386/kernel/i387.c +++ b/arch/i386/kernel/i387.c @@ -17,6 +17,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #define HAVE_HWFP (boot_cpu_data.hard_math) @@ -37,6 +38,10 @@ if (mask == 0) mask = 0x0000ffbf; } mxcsr_feature_mask &= mask; + + if (cooperative_mode_enabled()) + return; + stts(); } @@ -386,6 +391,7 @@ return err; } + /* * ptrace request handlers. 
*/ diff -urN a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -26,9 +26,89 @@ #include #include +#include #include +#ifdef CONFIG_COOPERATIVE + +CO_TRACE_STOP; + +void proxy_interrupt_handler(unsigned long interrupt, struct pt_regs regs) +{ + unsigned long flags; + + co_passage_page_acquire(&flags); + co_passage_page->operation = CO_OPERATION_FORWARD_INTERRUPT; + co_passage_page->params[0] = interrupt + 0x20; + co_passage_page->params[1] = regs.eip; + co_passage_page->params[2] = (unsigned long)(&((&interrupt)[10])); + co_passage_page->host_state.flags &= ~(1 << 9); /* Turn IF off */ + co_switch_wrapper(); + co_callback(flags); +} + +CO_TRACE_CONTINUE; + +#define IRQLIST_16(x) \ + IRQ(x,0) IRQ(x,1) IRQ(x,2) IRQ(x,3) \ + IRQ(x,4) IRQ(x,5) IRQ(x,6) IRQ(x,7) \ + IRQ(x,8) IRQ(x,9) IRQ(x,a) IRQ(x,b) \ + IRQ(x,c) IRQ(x,d) IRQ(x,e) IRQ(x,f) + +#define IRQLIST_224 \ + IRQLIST_16(0x0) IRQLIST_16(0x1) IRQLIST_16(0x2) IRQLIST_16(0x3) \ + IRQLIST_16(0x4) IRQLIST_16(0x5) IRQLIST_16(0x6) IRQLIST_16(0x7) \ + IRQLIST_16(0x8) IRQLIST_16(0x9) IRQLIST_16(0xa) IRQLIST_16(0xb) \ + IRQLIST_16(0xc) IRQLIST_16(0xd) + +#define IRQ(x,y) \ + extern asmlinkage void IRQ_proxy_##x##y##_interrupt(void); +IRQLIST_224; +#undef IRQ + +#define BIRQ(id) \ +asm( \ + "\n"__ALIGN_STR"\n" \ + ".section .text\n" \ + ".globl IRQ_proxy_" #id "_interrupt\n" \ + "IRQ_proxy_" #id "_interrupt:\n" \ + "push %eax\n\t" \ + "cld;\n\t" \ + "pushl %es;\n\t" \ + "pushl %ds;\n\t" \ + "pushl %eax;\n\t" \ + "pushl %ebp;\n\t" \ + "pushl %edi;\n\t" \ + "pushl %esi;\n\t" \ + "pushl %edx;\n\t" \ + "pushl %ecx;\n\t" \ + "pushl %ebx;\n\t" \ + "movl $123, %edx;\n\t" \ + "movl %edx, %ds;\n\t" \ + "movl %edx, %es;\n\t" \ + "pushl $" #id "\n\t" \ + "call proxy_interrupt_handler\n\t" \ + "popl %ebx\n\t" \ + "jmp ret_from_intr\n" \ + ".previous\n" \ + ); \ + +#define IRQ(x,y) BIRQ(x##y) +IRQLIST_224; +#undef IRQ + +#define IRQ(x,y) &IRQ_proxy_##x##y##_interrupt, 
+void (*proxy_interrupt[NR_IRQS])(void) = { + IRQLIST_224 +}; +#undef IRQ + +#undef IRQLIST_16 +#undef IRQLIST_224 + +#endif + /* * This is the 'legacy' 8259A Programmable Interrupt Controller, * present in the majority of PC/AT boxes. @@ -364,6 +444,9 @@ { int i; + if (cooperative_mode_enabled()) + return; + #ifdef CONFIG_X86_LOCAL_APIC init_bsp_APIC(); #endif @@ -388,6 +471,65 @@ } } +#ifdef CONFIG_X86_UP_COPIC + +/* + * Not like you have any other choice other than using + * COPIC in Cooperative mode. + */ + +static void end_COPIC_irq(unsigned int irq) +{ +} + +#define shutdown_COPIC_irq disable_COPIC_irq + +static void mask_and_ack_COPIC(unsigned int irq) +{ +} + +static unsigned int startup_COPIC_irq(unsigned int irq) +{ + return 0; +} + +void disable_COPIC_irq(unsigned int irq) +{ +} + +void enable_COPIC_irq(unsigned int irq) +{ +} + +static struct hw_interrupt_type co_pic_irq_type = { + "CO-PIC", + startup_COPIC_irq, + shutdown_COPIC_irq, + enable_COPIC_irq, + disable_COPIC_irq, + mask_and_ack_COPIC, + end_COPIC_irq, + NULL +}; + +void __init init_COPIC_irqs(void) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = 0; + irq_desc[i].depth = 1; + + irq_desc[i].handler = &co_pic_irq_type; + } + +} + +#else +#define init_COPIC_irqs() do {} while (0); +#endif + void __init init_IRQ(void) { int i; @@ -395,6 +537,22 @@ /* all the set up before the call gates are initialised */ pre_intr_init_hook(); + if (cooperative_mode_enabled()) { + printk("Setting proxy interrupt vectors\n"); + + init_COPIC_irqs(); + + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + if (vector != SYSCALL_VECTOR) + set_intr_gate(vector, proxy_interrupt[i]); + } + + return; + } + /* * Cover the whole vector space, no vector can escape * us. 
(some of these will be overridden and become diff -urN a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c --- a/arch/i386/kernel/ioport.c +++ b/arch/i386/kernel/ioport.c @@ -15,6 +15,7 @@ #include #include #include +#include /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) @@ -61,6 +62,9 @@ struct tss_struct * tss; unsigned long *bitmap; + if (cooperative_mode_enabled()) + return -EPERM; + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; if (turn_on && !capable(CAP_SYS_RAWIO)) @@ -133,6 +137,9 @@ unsigned int level = regs->ebx; unsigned int old = (regs->eflags >> 12) & 3; + if (cooperative_mode_enabled()) + return -EPERM; + if (level > 3) return -EINVAL; /* Trying to gain more privileges? */ diff -urN a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -52,6 +52,7 @@ #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -147,21 +148,24 @@ /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { - void (*idle)(void); - /* - * Mark this as an RCU critical section so that - * synchronize_kernel() in the unload path waits - * for our completion. - */ - rcu_read_lock(); - idle = pm_idle; + void (*idle)(void) = pm_idle; + + /* + * Mark this as an RCU critical section so that + * synchronize_kernel() in the unload path waits + * for our completion. 
+ */ + rcu_read_lock(); + + if (cooperative_mode_enabled()) + idle = co_idle_processor; if (!idle) idle = default_idle; irq_stat[smp_processor_id()].idle_timestamp = jiffies; idle(); - rcu_read_unlock(); + rcu_read_unlock(); } schedule(); } diff -urN a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -13,6 +13,7 @@ #include #include #include "mach_reboot.h" +#include /* * Power off function, if any @@ -217,6 +218,11 @@ { unsigned long flags; + if (cooperative_mode_enabled()) { + co_terminate(CO_TERMINATE_REBOOT); + return; + } + local_irq_disable(); /* Write zero to CMOS register number 0x0f, which the BIOS POST @@ -332,8 +338,13 @@ */ smp_send_stop(); #endif /* CONFIG_SMP */ - + lapic_shutdown(); + + if (cooperative_mode_enabled()) { + co_terminate(CO_TERMINATE_REBOOT); + return; + } #ifdef CONFIG_X86_IO_APIC disable_IO_APIC(); @@ -364,12 +375,18 @@ void machine_halt(void) { + co_terminate(CO_TERMINATE_HALT); } EXPORT_SYMBOL(machine_halt); void machine_power_off(void) { + if (cooperative_mode_enabled()) { + co_terminate(CO_TERMINATE_POWEROFF); + return; + } + lapic_shutdown(); if (efi_enabled) diff -urN a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -39,6 +39,7 @@ #include #include #include +#include #include