Merge branch 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS update from Ingo Molnar: "The changes in this tree are: - ACPI APEI (ACPI Platform Error Interface) improvements, by Chen Gong - misc MCE fixes/cleanups" * 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/mce: Update MCE severity condition check mce: acpi/apei: Add comments to clarify usage of the various bitfields in the MCA subsystem ACPI/APEI: Update einj documentation for param1/param2 ACPI/APEI: Add parameter check before error injection ACPI, APEI, EINJ: Fix error return code in einj_init() x86, mce: Fix "braodcast" typo
This commit is contained in:
commit
3045f94a20
8 changed files with 68 additions and 19 deletions
|
@ -47,11 +47,16 @@ directory apei/einj. The following files are provided.
|
|||
|
||||
- param1
|
||||
This file is used to set the first error parameter value. Effect of
|
||||
parameter depends on error_type specified.
|
||||
parameter depends on error_type specified. For example, if error
|
||||
type is memory related, param1 should be a valid physical
|
||||
memory address.
|
||||
|
||||
- param2
|
||||
This file is used to set the second error parameter value. Effect of
|
||||
parameter depends on error_type specified.
|
||||
parameter depends on error_type specified. For example, if error
|
||||
type is memory related, param2 should be a physical memory
|
||||
address mask. Linux requires page or narrower granularity, say,
|
||||
0xfffffffffffff000.
|
||||
|
||||
- notrigger
|
||||
The EINJ mechanism is a two step process. First inject the error, then
|
||||
|
|
|
@ -61,7 +61,7 @@
|
|||
#define MCJ_CTX_IRQ 0x2 /* inject context: IRQ */
|
||||
#define MCJ_NMI_BROADCAST 0x4 /* do NMI broadcasting */
|
||||
#define MCJ_EXCEPTION 0x8 /* raise as exception */
|
||||
#define MCJ_IRQ_BRAODCAST 0x10 /* do IRQ broadcasting */
|
||||
#define MCJ_IRQ_BROADCAST 0x10 /* do IRQ broadcasting */
|
||||
|
||||
#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
|
||||
|
||||
|
|
|
@ -153,7 +153,7 @@ static void raise_mce(struct mce *m)
|
|||
return;
|
||||
|
||||
#ifdef CONFIG_X86_LOCAL_APIC
|
||||
if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) {
|
||||
if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) {
|
||||
unsigned long start;
|
||||
int cpu;
|
||||
|
||||
|
@ -167,7 +167,7 @@ static void raise_mce(struct mce *m)
|
|||
cpumask_clear_cpu(cpu, mce_inject_cpumask);
|
||||
}
|
||||
if (!cpumask_empty(mce_inject_cpumask)) {
|
||||
if (m->inject_flags & MCJ_IRQ_BRAODCAST) {
|
||||
if (m->inject_flags & MCJ_IRQ_BROADCAST) {
|
||||
/*
|
||||
* don't wait because mce_irq_ipi is necessary
|
||||
* to be sync with following raise_local
|
||||
|
|
|
@ -110,22 +110,17 @@ static struct severity {
|
|||
/* known AR MCACODs: */
|
||||
#ifdef CONFIG_MEMORY_FAILURE
|
||||
MCESEV(
|
||||
KEEP, "HT thread notices Action required: data load error",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
|
||||
MCGMASK(MCG_STATUS_EIPV, 0)
|
||||
KEEP, "Action required but unaffected thread is continuable",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR),
|
||||
MCGMASK(MCG_STATUS_RIPV, MCG_STATUS_RIPV)
|
||||
),
|
||||
MCESEV(
|
||||
AR, "Action required: data load error",
|
||||
AR, "Action required: data load error in a user process",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
|
||||
USER
|
||||
),
|
||||
MCESEV(
|
||||
KEEP, "HT thread notices Action required: instruction fetch error",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
|
||||
MCGMASK(MCG_STATUS_EIPV, 0)
|
||||
),
|
||||
MCESEV(
|
||||
AR, "Action required: instruction fetch error",
|
||||
AR, "Action required: instruction fetch error in a user process",
|
||||
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
|
||||
USER
|
||||
),
|
||||
|
|
|
@ -89,7 +89,10 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
|
|||
static DEFINE_PER_CPU(struct mce, mces_seen);
|
||||
static int cpu_missing;
|
||||
|
||||
/* MCA banks polled by the period polling timer for corrected events */
|
||||
/*
|
||||
* MCA banks polled by the periodic polling timer for corrected events.
|
||||
* With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
|
||||
*/
|
||||
DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
|
||||
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
|
||||
};
|
||||
|
|
|
@ -24,6 +24,18 @@
|
|||
* Also supports reliable discovery of shared banks.
|
||||
*/
|
||||
|
||||
/*
|
||||
* CMCI can be delivered to multiple cpus that share a machine check bank
|
||||
* so we need to designate a single cpu to process errors logged in each bank
|
||||
* in the interrupt handler (otherwise we would have many races and potential
|
||||
* double reporting of the same error).
|
||||
* Note that this can change when a cpu is offlined or brought online since
|
||||
* some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
|
||||
* disables CMCI on all banks owned by the cpu and clears this bitfield. At
|
||||
* this point, cmci_rediscover() kicks in and a different cpu may end up
|
||||
* taking ownership of some of the shared MCA banks that were previously
|
||||
* owned by the offlined cpu.
|
||||
*/
|
||||
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
|
||||
|
||||
/*
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
#include <linux/seq_file.h>
|
||||
#include <linux/nmi.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/mm.h>
|
||||
#include <acpi/acpi.h>
|
||||
|
||||
#include "apei-internal.h"
|
||||
|
@ -41,6 +42,10 @@
|
|||
#define SPIN_UNIT 100 /* 100ns */
|
||||
/* Firmware should respond within 1 millisecond */
|
||||
#define FIRMWARE_TIMEOUT (1 * NSEC_PER_MSEC)
|
||||
#define ACPI5_VENDOR_BIT BIT(31)
|
||||
#define MEM_ERROR_MASK (ACPI_EINJ_MEMORY_CORRECTABLE | \
|
||||
ACPI_EINJ_MEMORY_UNCORRECTABLE | \
|
||||
ACPI_EINJ_MEMORY_FATAL)
|
||||
|
||||
/*
|
||||
* ACPI version 5 provides a SET_ERROR_TYPE_WITH_ADDRESS action.
|
||||
|
@ -367,7 +372,7 @@ static int __einj_error_trigger(u64 trigger_paddr, u32 type,
|
|||
* This will cause resource conflict with regular memory. So
|
||||
* remove it from trigger table resources.
|
||||
*/
|
||||
if ((param_extension || acpi5) && (type & 0x0038) && param2) {
|
||||
if ((param_extension || acpi5) && (type & MEM_ERROR_MASK) && param2) {
|
||||
struct apei_resources addr_resources;
|
||||
apei_resources_init(&addr_resources);
|
||||
trigger_param_region = einj_get_trigger_parameter_region(
|
||||
|
@ -427,7 +432,7 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2)
|
|||
struct set_error_type_with_address *v5param = einj_param;
|
||||
|
||||
v5param->type = type;
|
||||
if (type & 0x80000000) {
|
||||
if (type & ACPI5_VENDOR_BIT) {
|
||||
switch (vendor_flags) {
|
||||
case SETWA_FLAGS_APICID:
|
||||
v5param->apicid = param1;
|
||||
|
@ -512,7 +517,34 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2)
|
|||
static int einj_error_inject(u32 type, u64 param1, u64 param2)
|
||||
{
|
||||
int rc;
|
||||
unsigned long pfn;
|
||||
|
||||
/*
|
||||
* We need extra sanity checks for memory errors.
|
||||
* Other types leap directly to injection.
|
||||
*/
|
||||
|
||||
/* ensure param1/param2 exist */
|
||||
if (!(param_extension || acpi5))
|
||||
goto inject;
|
||||
|
||||
/* ensure injection is memory related */
|
||||
if (type & ACPI5_VENDOR_BIT) {
|
||||
if (vendor_flags != SETWA_FLAGS_MEM)
|
||||
goto inject;
|
||||
} else if (!(type & MEM_ERROR_MASK))
|
||||
goto inject;
|
||||
|
||||
/*
|
||||
* Disallow crazy address masks that give BIOS leeway to pick
|
||||
* injection address almost anywhere. Insist on page or
|
||||
* better granularity and that target address is normal RAM.
|
||||
*/
|
||||
pfn = PFN_DOWN(param1 & param2);
|
||||
if (!page_is_ram(pfn) || ((param2 & PAGE_MASK) != PAGE_MASK))
|
||||
return -EINVAL;
|
||||
|
||||
inject:
|
||||
mutex_lock(&einj_mutex);
|
||||
rc = __einj_error_inject(type, param1, param2);
|
||||
mutex_unlock(&einj_mutex);
|
||||
|
@ -590,7 +622,7 @@ static int error_type_set(void *data, u64 val)
|
|||
* Vendor defined types have 0x80000000 bit set, and
|
||||
* are not enumerated by ACPI_EINJ_GET_ERROR_TYPE
|
||||
*/
|
||||
vendor = val & 0x80000000;
|
||||
vendor = val & ACPI5_VENDOR_BIT;
|
||||
tval = val & 0x7fffffff;
|
||||
|
||||
/* Only one error type can be specified */
|
||||
|
@ -694,6 +726,7 @@ static int __init einj_init(void)
|
|||
if (rc)
|
||||
goto err_release;
|
||||
|
||||
rc = -ENOMEM;
|
||||
einj_param = einj_get_parameter_address();
|
||||
if ((param_extension || acpi5) && einj_param) {
|
||||
fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR,
|
||||
|
|
|
@ -409,6 +409,7 @@ int __weak page_is_ram(unsigned long pfn)
|
|||
{
|
||||
return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(page_is_ram);
|
||||
|
||||
void __weak arch_remove_reservations(struct resource *avail)
|
||||
{
|
||||
|
|
Loading…
Reference in a new issue