From a192cd0413b71c2a3e4e48dd365af704be72b748 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 May 2012 13:26:37 -0400 Subject: [PATCH 1/9] ftrace: Synchronize variable setting with breakpoints When the function tracer starts modifying the code via breakpoints it sets a variable (modifying_ftrace_code) to inform the breakpoint handler to call the ftrace int3 code. But there's no synchronization between setting this code and the handler, thus it is possible for the handler to be called on another CPU before it sees the variable. This will cause a kernel crash as the int3 handler will not know what to do with it. I originally added smp_mb()'s to force the visibility of the variable but H. Peter Anvin suggested that I just make it atomic. [ Added comments as suggested by Peter Zijlstra ] Suggested-by: H. Peter Anvin Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 2 +- arch/x86/kernel/ftrace.c | 38 ++++++++++++++++++++++++++++++++--- arch/x86/kernel/traps.c | 8 ++++++-- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 18d9005d9e4f..b0767bc08740 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -34,7 +34,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); -extern int modifying_ftrace_code; +extern atomic_t modifying_ftrace_code; static inline unsigned long ftrace_call_adjust(unsigned long addr) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 32ff36596ab1..2407a6d81cb7 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -168,7 +168,38 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ret; } -int modifying_ftrace_code __read_mostly; +/* + * The modifying_ftrace_code is used to tell the breakpoint + * handler to call ftrace_int3_handler(). If it fails to + * call this handler for a breakpoint added by ftrace, then + * the kernel may crash. + * + * As atomic_writes on x86 do not need a barrier, we do not + * need to add smp_mb()s for this to work. It is also considered + * that we can not read the modifying_ftrace_code before + * executing the breakpoint. That would be quite remarkable if + * it could do that. Here's the flow that is required: + * + * CPU-0 CPU-1 + * + * atomic_inc(mfc); + * write int3s + * // implicit (r)mb + * if (atomic_read(mfc)) + * call ftrace_int3_handler() + * + * Then when we are finished: + * + * atomic_dec(mfc); + * + * If we hit a breakpoint that was not set by ftrace, it does not + * matter if ftrace_int3_handler() is called or not. It will + * simply be ignored. But it is crucial that a ftrace nop/caller + * breakpoint is handled. No other user should ever place a + * breakpoint on an ftrace nop/caller location. It must only + * be done by this code. + */ +atomic_t modifying_ftrace_code __read_mostly; /* * A breakpoint was added to the code address we are about to @@ -491,11 +522,12 @@ void ftrace_replace_code(int enable) void arch_ftrace_update_code(int command) { - modifying_ftrace_code++; + /* See comment above by declaration of modifying_ftrace_code */ + atomic_inc(&modifying_ftrace_code); ftrace_modify_all_code(command); - modifying_ftrace_code--; + atomic_dec(&modifying_ftrace_code); } int __init ftrace_dyn_arch_init(void *data) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ff08457a025d..05b31d92f69c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -303,8 +303,12 @@ gp_in_kernel: dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_DYNAMIC_FTRACE - /* ftrace must be first, everything else may cause a recursive crash */ - if (unlikely(modifying_ftrace_code) && ftrace_int3_handler(regs)) + /* + * ftrace must be first, everything else may cause a recursive crash. + * See note by declaration of modifying_ftrace_code in ftrace.c + */ + if (unlikely(atomic_read(&modifying_ftrace_code)) && + ftrace_int3_handler(regs)) return; #endif #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP From 8a4d0a687a599f39b7df3fe15f2d51d2157caf44 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 May 2012 13:36:38 -0400 Subject: [PATCH 2/9] ftrace: Use breakpoint method to update ftrace caller On boot up and module load, it is fine to modify the code directly, without the use of breakpoints. This is because boot up modification is done before SMP is initialized, thus the modification is serial, and module load is done before the module executes. But after that we must use a SMP safe method to modify running code. Otherwise, if we are running the function tracer and update its function (by starting off the stack tracer, or perf tracing) the change of the function called by the ftrace trampoline is done directly. If this is being executed on another CPU, that CPU may take a GPF and crash the kernel. The breakpoint method is used to change the nops at all the functions, but the change of the ftrace callback handler itself was still using a direct modification. If tracing was enabled and the function callback was changed then another CPU could fault if it was currently calling the original callback. This modification must use the breakpoint method too. Note, the direct method is still used for boot up and module load. Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 88 ++++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2407a6d81cb7..c3a7cb4bf6e6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -100,7 +100,7 @@ static const unsigned char *ftrace_nop_replace(void) } static int -ftrace_modify_code(unsigned long ip, unsigned const char *old_code, +ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, unsigned const char *new_code) { unsigned char replaced[MCOUNT_INSN_SIZE]; @@ -141,7 +141,20 @@ int ftrace_make_nop(struct module *mod, old = ftrace_call_replace(ip, addr); new = ftrace_nop_replace(); - return ftrace_modify_code(rec->ip, old, new); + /* + * On boot up, and when modules are loaded, the MCOUNT_ADDR + * is converted to a nop, and will never become MCOUNT_ADDR + * again. This code is either running before SMP (on boot up) + * or before the code will ever be executed (module load). + * We do not want to use the breakpoint version in this case, + * just modify the code directly. + */ + if (addr == MCOUNT_ADDR) + return ftrace_modify_code_direct(rec->ip, old, new); + + /* Normal cases use add_brk_on_nop */ + WARN_ONCE(1, "invalid use of ftrace_make_nop"); + return -EINVAL; } int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) @@ -152,20 +165,8 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) old = ftrace_nop_replace(); new = ftrace_call_replace(ip, addr); - return ftrace_modify_code(rec->ip, old, new); -} - -int ftrace_update_ftrace_func(ftrace_func_t func) -{ - unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[MCOUNT_INSN_SIZE], *new; - int ret; - - memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); - new = ftrace_call_replace(ip, (unsigned long)func); - ret = ftrace_modify_code(ip, old, new); - - return ret; + /* Should only be called when module is loaded */ + return ftrace_modify_code_direct(rec->ip, old, new); } /* @@ -201,6 +202,29 @@ int ftrace_update_ftrace_func(ftrace_func_t func) */ atomic_t modifying_ftrace_code __read_mostly; +static int +ftrace_modify_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code); + +int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[MCOUNT_INSN_SIZE], *new; + int ret; + + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); + new = ftrace_call_replace(ip, (unsigned long)func); + + /* See comment above by declaration of modifying_ftrace_code */ + atomic_inc(&modifying_ftrace_code); + + ret = ftrace_modify_code(ip, old, new); + + atomic_dec(&modifying_ftrace_code); + + return ret; +} + /* * A breakpoint was added to the code address we are about to * modify, and this is the handle that will just skip over it. @@ -520,6 +544,38 @@ void ftrace_replace_code(int enable) } } +static int +ftrace_modify_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code) +{ + int ret; + + ret = add_break(ip, old_code); + if (ret) + goto out; + + run_sync(); + + ret = add_update_code(ip, new_code); + if (ret) + goto fail_update; + + run_sync(); + + ret = ftrace_write(ip, new_code, 1); + if (ret) { + ret = -EPERM; + goto out; + } + run_sync(); + out: + return ret; + + fail_update: + probe_kernel_write((void *)ip, &old_code[0], 1); + goto out; +} + void arch_ftrace_update_code(int command) { /* See comment above by declaration of modifying_ftrace_code */ From c0525a6972d3f1fb83058ef503e183475d6e4e26 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 May 2012 11:43:19 -0400 Subject: [PATCH 3/9] x86: Reset the debug_stack update counter When an NMI goes off and it sees that it preempted the debug stack, to keep the debug stack safe, it changes the IDT to point to one that does not modify the stack on breakpoint (to allow breakpoints in NMIs). But the variable that gets set to know to undo it on exit never gets cleared on exit. Thus every NMI will reset it on exit the first time it is done even if it does not need to be reset. [ Added H. Peter Anvin's suggestion to use this_cpu_read/write ] Cc: # v3.3 Signed-off-by: Steven Rostedt --- arch/x86/kernel/nmi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 90875279ef3d..a0b2f84457be 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -444,14 +444,16 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs) */ if (unlikely(is_debug_stack(regs->sp))) { debug_stack_set_zero(); - __get_cpu_var(update_debug_stack) = 1; + this_cpu_write(update_debug_stack, 1); } } static inline void nmi_nesting_postprocess(void) { - if (unlikely(__get_cpu_var(update_debug_stack))) + if (unlikely(this_cpu_read(update_debug_stack))) { debug_stack_reset(); + this_cpu_write(update_debug_stack, 0); + } } #endif From f8988175fd70874d1fb3712b1c5d3bfc6d455202 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 May 2012 11:47:00 -0400 Subject: [PATCH 4/9] x86: Allow nesting of the debug stack IDT setting When the NMI handler runs, it checks if it preempted a debug handler and if that handler is using the debug stack. If it is, it changes the IDT table not to update the stack, otherwise it will reset the debug stack and corrupt the debug handler it preempted. Now that ftrace uses breakpoints to change functions from nops to callers, many more places may hit a breakpoint. Unfortunately this includes some of the calls that lockdep performs. Which causes issues with the debug stack. It too needs to change the debug stack before tracing (if called from the debug handler). Allow the debug_stack_set_zero() and debug_stack_reset() to be nested so that the debug handlers can take advantage of them too. [ Used this_cpu_*() over __get_cpu_var() as suggested by H. Peter Anvin ] Signed-off-by: Steven Rostedt --- arch/x86/kernel/cpu/common.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 82f29e70d058..6b9333b429ba 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1101,14 +1101,20 @@ int is_debug_stack(unsigned long addr) addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); } +static DEFINE_PER_CPU(u32, debug_stack_use_ctr); + void debug_stack_set_zero(void) { + this_cpu_inc(debug_stack_use_ctr); load_idt((const struct desc_ptr *)&nmi_idt_descr); } void debug_stack_reset(void) { - load_idt((const struct desc_ptr *)&idt_descr); + if (WARN_ON(!this_cpu_read(debug_stack_use_ctr))) + return; + if (this_cpu_dec_return(debug_stack_use_ctr) == 0) + load_idt((const struct desc_ptr *)&idt_descr); } #else /* CONFIG_X86_64 */ From 5963e317b1e9d2a4511503916d8fd664bb8fa8fb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 May 2012 11:54:53 -0400 Subject: [PATCH 5/9] ftrace/x86: Do not change stacks in DEBUG when calling lockdep When both DYNAMIC_FTRACE and LOCKDEP are set, the TRACE_IRQS_ON/OFF will call into the lockdep code. The lockdep code can call lots of functions that may be traced by ftrace. When ftrace is updating its code and hits a breakpoint, the breakpoint handler will call into lockdep. If lockdep happens to call a function that also has a breakpoint attached, it will jump back into the breakpoint handler resetting the stack to the debug stack and corrupt the contents currently on that stack. The 'do_sym' call that calls do_int3() is protected by modifying the IST table to point to a different location if another breakpoint is hit. But the TRACE_IRQS_OFF/ON are outside that protection, and if a breakpoint is hit from those, the stack will get corrupted, and the kernel will crash: [ 1013.243754] BUG: unable to handle kernel NULL pointer dereference at 0000000000000002 [ 1013.272665] IP: [] 0xffff880145cbffff [ 1013.285186] PGD 1401b2067 PUD 14324c067 PMD 0 [ 1013.298832] Oops: 0010 [#1] PREEMPT SMP [ 1013.310600] CPU 2 [ 1013.317904] Modules linked in: ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables crc32c_intel ghash_clmulni_intel microcode usb_debug serio_raw pcspkr iTCO_wdt i2c_i801 iTCO_vendor_support e1000e nfsd nfs_acl auth_rpcgss lockd sunrpc i915 video i2c_algo_bit drm_kms_helper drm i2c_core [last unloaded: scsi_wait_scan] [ 1013.401848] [ 1013.407399] Pid: 112, comm: kworker/2:1 Not tainted 3.4.0+ #30 [ 1013.437943] RIP: 8eb8:[] [] 0xffff880146309fff [ 1013.459871] RSP: ffffffff8165e919:ffff88014780f408 EFLAGS: 00010046 [ 1013.477909] RAX: 0000000000000001 RBX: ffffffff81104020 RCX: 0000000000000000 [ 1013.499458] RDX: ffff880148008ea8 RSI: ffffffff8131ef40 RDI: ffffffff82203b20 [ 1013.521612] RBP: ffffffff81005751 R08: 0000000000000000 R09: 0000000000000000 [ 1013.543121] R10: ffffffff82cdc318 R11: 0000000000000000 R12: ffff880145cc0000 [ 1013.564614] R13: ffff880148008eb8 R14: 0000000000000002 R15: ffff88014780cb40 [ 1013.586108] FS: 0000000000000000(0000) GS:ffff880148000000(0000) knlGS:0000000000000000 [ 1013.609458] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1013.627420] CR2: 0000000000000002 CR3: 0000000141f10000 CR4: 00000000001407e0 [ 1013.649051] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1013.670724] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 1013.692376] Process kworker/2:1 (pid: 112, threadinfo ffff88013fe0e000, task ffff88014020a6a0) [ 1013.717028] Stack: [ 1013.724131] ffff88014780f570 ffff880145cc0000 0000400000004000 0000000000000000 [ 1013.745918] cccccccccccccccc ffff88014780cca8 ffffffff811072bb ffffffff81651627 [ 1013.767870] ffffffff8118f8a7 ffffffff811072bb ffffffff81f2b6c5 ffffffff81f11bdb [ 1013.790021] Call Trace: [ 1013.800701] Code: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a d7 64 81 ff ff ff ff 01 00 00 00 00 00 00 00 65 d9 64 81 ff [ 1013.861443] RIP [] 0xffff880146309fff [ 1013.884466] RSP [ 1013.901507] CR2: 0000000000000002 The solution was to reuse the NMI functions that change the IDT table to make the debug stack keep its current stack (in kernel mode) when hitting a breakpoint: call debug_stack_set_zero TRACE_IRQS_ON call debug_stack_reset If the TRACE_IRQS_ON happens to hit a breakpoint then it will keep the current stack and not crash the box. Reported-by: Dave Jones Signed-off-by: Steven Rostedt --- arch/x86/kernel/entry_64.S | 44 +++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 320852d02026..7d65133b51be 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -190,6 +190,44 @@ ENDPROC(native_usergs_sysret64) #endif .endm +/* + * When dynamic function tracer is enabled it will add a breakpoint + * to all locations that it is about to modify, sync CPUs, update + * all the code, sync CPUs, then remove the breakpoints. In this time + * if lockdep is enabled, it might jump back into the debug handler + * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF). + * + * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to + * make sure the stack pointer does not get reset back to the top + * of the debug stack, and instead just reuses the current stack. + */ +#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS) + +.macro TRACE_IRQS_OFF_DEBUG + call debug_stack_set_zero + TRACE_IRQS_OFF + call debug_stack_reset +.endm + +.macro TRACE_IRQS_ON_DEBUG + call debug_stack_set_zero + TRACE_IRQS_ON + call debug_stack_reset +.endm + +.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET + bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + jnc 1f + TRACE_IRQS_ON_DEBUG +1: +.endm + +#else +# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF +# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON +# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ +#endif + /* * C code is not supposed to know about undefined top of stack. Every time * a C function with an pt_regs argument is called from the SYSCALL based @@ -1098,7 +1136,7 @@ ENTRY(\sym) subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 call save_paranoid - TRACE_IRQS_OFF + TRACE_IRQS_OFF_DEBUG movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) @@ -1393,7 +1431,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip) ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF + TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ jnz paranoid_restore testl $3,CS(%rsp) @@ -1404,7 +1442,7 @@ paranoid_swapgs: RESTORE_ALL 8 jmp irq_return paranoid_restore: - TRACE_IRQS_IRETQ 0 + TRACE_IRQS_IRETQ_DEBUG 0 RESTORE_ALL 8 jmp irq_return paranoid_userspace: From 30dc0d0fe5d08396dbdaa2d70972149131340960 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Thu, 15 Mar 2012 19:13:25 +0000 Subject: [PATCH 6/9] x86, efi: Only close open files in error path The loop at the 'close_handles' label in handle_ramdisks() should be using 'i', which represents the number of initrd files that were successfully opened, not 'nr_initrds' which is the number of initrd= arguments passed on the command line. Currently, if we execute the loop to close all file handles and we failed to open any initrds we'll try to call the close function on a garbage pointer, causing the machine to hang. Cc: Matthew Garrett Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1331907517-3985-2-git-send-email-matt@console-pimps.org Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/eboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 2c14e76bb4c7..52a4e667b258 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -674,7 +674,7 @@ free_initrd_total: low_free(initrd_total, initrd_addr); close_handles: - for (k = j; k < nr_initrds; k++) + for (k = j; k < i; k++) efi_call_phys1(fh->close, initrds[k].handle); free_initrds: efi_call_phys1(sys_table->boottime->free_pool, initrds); From 9fa7dedad3d30345c843bd82db02c4d6169e5f61 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Mon, 20 Feb 2012 13:20:59 +0000 Subject: [PATCH 7/9] x86, efi; Add EFI boot stub console support We need a way of printing useful messages to the user, for example when we fail to open an initrd file, instead of just hanging the machine without giving the user any indication of what went wrong. So sprinkle some error messages throughout the EFI boot stub code to make it easier for users to diagnose/report problems. Reported-by: Keshav P R Cc: Matthew Garrett Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1331907517-3985-3-git-send-email-matt@console-pimps.org Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/eboot.c | 85 ++++++++++++++++++++++++++------ arch/x86/boot/compressed/eboot.h | 6 +++ 2 files changed, 75 insertions(+), 16 deletions(-) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index 52a4e667b258..4e85f5f85837 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -16,6 +16,26 @@ static efi_system_table_t *sys_table; +static void efi_printk(char *str) +{ + char *s8; + + for (s8 = str; *s8; s8++) { + struct efi_simple_text_output_protocol *out; + efi_char16_t ch[2] = { 0 }; + + ch[0] = *s8; + out = (struct efi_simple_text_output_protocol *)sys_table->con_out; + + if (*s8 == '\n') { + efi_char16_t nl[2] = { '\r', 0 }; + efi_call_phys2(out->output_string, out, nl); + } + + efi_call_phys2(out->output_string, out, ch); + } +} + static efi_status_t __get_map(efi_memory_desc_t **map, unsigned long *map_size, unsigned long *desc_size) { @@ -531,8 +551,10 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image, EFI_LOADER_DATA, nr_initrds * sizeof(*initrds), &initrds); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for initrds\n"); goto fail; + } str = (char *)(unsigned long)hdr->cmd_line_ptr; for (i = 0; i < nr_initrds; i++) { @@ -575,32 +597,42 @@ static efi_status_t handle_ramdisks(efi_loaded_image_t *image, status = efi_call_phys3(boottime->handle_protocol, image->device_handle, &fs_proto, &io); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to handle fs_proto\n"); goto free_initrds; + } status = efi_call_phys2(io->open_volume, io, &fh); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to open volume\n"); goto free_initrds; + } } status = efi_call_phys5(fh->open, fh, &h, filename_16, EFI_FILE_MODE_READ, (u64)0); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to open initrd file\n"); goto close_handles; + } initrd->handle = h; info_sz = 0; status = efi_call_phys4(h->get_info, h, &info_guid, &info_sz, NULL); - if (status != EFI_BUFFER_TOO_SMALL) + if (status != EFI_BUFFER_TOO_SMALL) { + efi_printk("Failed to get initrd info size\n"); goto close_handles; + } grow: status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, info_sz, &info); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for initrd info\n"); goto close_handles; + } status = efi_call_phys4(h->get_info, h, &info_guid, &info_sz, info); @@ -612,8 +644,10 @@ grow: file_sz = info->file_size; efi_call_phys1(sys_table->boottime->free_pool, info); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to get initrd info\n"); goto close_handles; + } initrd->size = file_sz; initrd_total += file_sz; @@ -629,11 +663,14 @@ grow: */ status = high_alloc(initrd_total, 0x1000, &initrd_addr, hdr->initrd_addr_max); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc highmem for initrds\n"); goto close_handles; + } /* We've run out of free low memory. */ if (initrd_addr > hdr->initrd_addr_max) { + efi_printk("We've run out of free low memory\n"); status = EFI_INVALID_PARAMETER; goto free_initrd_total; } @@ -652,8 +689,10 @@ grow: status = efi_call_phys3(fh->read, initrds[j].handle, &chunksize, addr); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to read initrd\n"); goto free_initrd_total; + } addr += chunksize; size -= chunksize; } @@ -732,8 +771,10 @@ static efi_status_t make_boot_params(struct boot_params *boot_params, options_size++; /* NUL termination */ status = low_alloc(options_size, 1, &cmdline); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for cmdline\n"); goto fail; + } s1 = (u8 *)(unsigned long)cmdline; s2 = (u16 *)options; @@ -895,12 +936,16 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) status = efi_call_phys3(sys_table->boottime->handle_protocol, handle, &proto, (void *)&image); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n"); goto fail; + } status = low_alloc(0x4000, 1, (unsigned long *)&boot_params); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc lowmem for boot params\n"); goto fail; + } memset(boot_params, 0x0, 0x4000); @@ -933,8 +978,10 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) if (status != EFI_SUCCESS) { status = low_alloc(hdr->init_size, hdr->kernel_alignment, &start); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for kernel\n"); goto fail; + } } hdr->code32_start = (__u32)start; @@ -945,19 +992,25 @@ struct boot_params *efi_main(void *handle, efi_system_table_t *_table) status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for gdt structure\n"); goto fail; + } gdt->size = 0x800; status = low_alloc(gdt->size, 8, (unsigned long *)&gdt->address); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for gdt\n"); goto fail; + } status = efi_call_phys3(sys_table->boottime->allocate_pool, EFI_LOADER_DATA, sizeof(*idt), (void **)&idt); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk("Failed to alloc mem for idt structure\n"); goto fail; + } idt->size = 0; idt->address = 0; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index 39251663e65b..3b6e15627c55 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -58,4 +58,10 @@ struct efi_uga_draw_protocol { void *blt; }; +struct efi_simple_text_output_protocol { + void *reset; + void *output_string; + void *test_string; +}; + #endif /* BOOT_COMPRESSED_EBOOT_H */ From 0c7596621e313bfcfbacb288e768c7150f5de9e0 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 16 Mar 2012 12:03:13 +0000 Subject: [PATCH 8/9] x86, efi: Add EFI boot stub documentation Since we can't expect every user to read the EFI boot stub code it seems prudent to have a couple of paragraphs explaining what it is and how it works. The "initrd=" option in particular is tricky because it only understands absolute EFI-style paths (backslashes as directory separators), and until now this hasn't been documented anywhere. This has tripped up a couple of users. Cc: Matthew Garrett Cc: Randy Dunlap Signed-off-by: Matt Fleming Link: http://lkml.kernel.org/r/1331907517-3985-4-git-send-email-matt@console-pimps.org Signed-off-by: H. Peter Anvin --- Documentation/x86/efi-stub.txt | 65 ++++++++++++++++++++++++++++++++++ arch/x86/Kconfig | 2 ++ 2 files changed, 67 insertions(+) create mode 100644 Documentation/x86/efi-stub.txt diff --git a/Documentation/x86/efi-stub.txt b/Documentation/x86/efi-stub.txt new file mode 100644 index 000000000000..44e6bb6ead10 --- /dev/null +++ b/Documentation/x86/efi-stub.txt @@ -0,0 +1,65 @@ + The EFI Boot Stub + --------------------------- + +On the x86 platform, a bzImage can masquerade as a PE/COFF image, +thereby convincing EFI firmware loaders to load it as an EFI +executable. The code that modifies the bzImage header, along with the +EFI-specific entry point that the firmware loader jumps to are +collectively known as the "EFI boot stub", and live in +arch/x86/boot/header.S and arch/x86/boot/compressed/eboot.c, +respectively. + +By using the EFI boot stub it's possible to boot a Linux kernel +without the use of a conventional EFI boot loader, such as grub or +elilo. Since the EFI boot stub performs the jobs of a boot loader, in +a certain sense it *IS* the boot loader. + +The EFI boot stub is enabled with the CONFIG_EFI_STUB kernel option. + + +**** How to install bzImage.efi + +The bzImage located in arch/x86/boot/bzImage must be copied to the EFI +System Partiion (ESP) and renamed with the extension ".efi". Without +the extension the EFI firmware loader will refuse to execute it. It's +not possible to execute bzImage.efi from the usual Linux file systems +because EFI firmware doesn't have support for them. + + +**** Passing kernel parameters from the EFI shell + +Arguments to the kernel can be passed after bzImage.efi, e.g. + + fs0:> bzImage.efi console=ttyS0 root=/dev/sda4 + + +**** The "initrd=" option + +Like most boot loaders, the EFI stub allows the user to specify +multiple initrd files using the "initrd=" option. This is the only EFI +stub-specific command line parameter, everything else is passed to the +kernel when it boots. + +The path to the initrd file must be an absolute path from the +beginning of the ESP, relative path names do not work. Also, the path +is an EFI-style path and directory elements must be separated with +backslashes (\). For example, given the following directory layout, + +fs0:> + Kernels\ + bzImage.efi + initrd-large.img + + Ramdisks\ + initrd-small.img + initrd-medium.img + +to boot with the initrd-large.img file if the current working +directory is fs0:\Kernels, the following command must be used, + + fs0:\Kernels> bzImage.efi initrd=\Kernels\initrd-large.img + +Notice how bzImage.efi can be specified with a relative path. That's +because the image we're executing is interpreted by the EFI shell, +which understands relative paths, whereas the rest of the command line +is passed to bzImage.efi. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d700811785ea..c70684f859e1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1506,6 +1506,8 @@ config EFI_STUB This kernel feature allows a bzImage to be loaded directly by EFI firmware without the use of a bootloader. + See Documentation/x86/efi-stub.txt for more information. + config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" From bad1a753d4d4deb09d4bc0bac1dd4fc3298502e9 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Mon, 21 May 2012 20:29:45 -0700 Subject: [PATCH 9/9] x86, x32, ptrace: Remove PTRACE_ARCH_PRCTL for x32 When I added x32 ptrace to 3.4 kernel, I also include PTRACE_ARCH_PRCTL support for x32 GDB For ARCH_GET_FS/GS, it takes a pointer to int64. But at user level, ARCH_GET_FS/GS takes a pointer to int32. So I have to add x32 ptrace to glibc to handle it with a temporary int64 passed to kernel and copy it back to GDB as int32. Roland suggested that PTRACE_ARCH_PRCTL is obsolete and x32 GDB should use fs_base and gs_base fields of user_regs_struct instead. Accordingly, remove PTRACE_ARCH_PRCTL completely from the x32 code to avoid possible memory overrun when pointer to int32 is passed to kernel. Link: http://lkml.kernel.org/r/CAMe9rOpDzHfS7NH7m1vmD9QRw8SSj4Sc%2BaNOgcWm_WJME2eRsQ@mail.gmail.com Signed-off-by: H. Peter Anvin Cc: v3.4 --- arch/x86/kernel/ptrace.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 13b1990c7c58..c4c6a5c2bf0f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1211,12 +1211,6 @@ static long x32_arch_ptrace(struct task_struct *child, 0, sizeof(struct user_i387_struct), datap); - /* normal 64bit interface to access TLS data. - Works just like arch_prctl, except that the arguments - are reversed. */ - case PTRACE_ARCH_PRCTL: - return do_arch_prctl(child, data, addr); - default: return compat_ptrace_request(child, request, addr, data); }