Merge branch 'x86-mce-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86/MCE update from Ingo Molnar:
 "Various MCE robustness enhancements.

  One of the changes adds CMCI (Corrected Machine Check Interrupt) poll
  mode on Intel Nehalem+ CPUs, which mode is automatically entered when
  the rate of messages is too high - and exited once the storm is over.

  An MCE events storm will roughly look like this:

    [ 5342.740616] mce: [Hardware Error]: Machine check events logged
    [ 5342.746501] mce: [Hardware Error]: Machine check events logged
    [ 5342.757971] CMCI storm detected: switching to poll mode
    [ 5372.674957] CMCI storm subsided: switching to interrupt mode

  This should make such events more survivable"

* 'x86-mce-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Provide boot argument to honour bios-set CMCI threshold
  x86, MCE: Remove unused defines
  x86, mce: Enable MCA support by default
  x86/mce: Add CMCI poll mode
  x86/mce: Make cmci_discover() quiet
  x86: mce: Remove the frozen cases in the hotplug code
  x86: mce: Split timer init
  x86: mce: Serialize mce injection
  x86: mce: Disable preemption when calling raise_local()
This commit is contained in:
commit
7687b80a4f
7 changed files with 249 additions and 64 deletions
|
@ -50,6 +50,13 @@ Machine check
|
|||
monarchtimeout:
|
||||
Sets the time in us to wait for other CPUs on machine checks. 0
|
||||
to disable.
|
||||
mce=bios_cmci_threshold
|
||||
Don't overwrite the bios-set CMCI threshold. This boot option
|
||||
prevents Linux from overwriting the CMCI threshold set by the
|
||||
bios. Without this option, Linux always sets the CMCI
|
||||
threshold to 1. Enabling this may make memory predictive failure
|
||||
analysis less effective if the bios sets thresholds for memory
|
||||
errors since we will not see details for all errors.
|
||||
|
||||
nomce (for compatibility with i386): same as mce=off
|
||||
|
||||
|
|
|
@ -874,6 +874,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
|
|||
|
||||
config X86_MCE
|
||||
bool "Machine Check / overheating reporting"
|
||||
default y
|
||||
---help---
|
||||
Machine Check support allows the processor to notify the
|
||||
kernel if it detects a problem (e.g. overheating, data corruption).
|
||||
|
|
|
@ -116,19 +116,9 @@ struct mce_log {
|
|||
/* Software defined banks */
|
||||
#define MCE_EXTENDED_BANK 128
|
||||
#define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0
|
||||
|
||||
#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */
|
||||
#define K8_MCE_THRESHOLD_BANK_0 (MCE_THRESHOLD_BASE + 0 * 9)
|
||||
#define K8_MCE_THRESHOLD_BANK_1 (MCE_THRESHOLD_BASE + 1 * 9)
|
||||
#define K8_MCE_THRESHOLD_BANK_2 (MCE_THRESHOLD_BASE + 2 * 9)
|
||||
#define K8_MCE_THRESHOLD_BANK_3 (MCE_THRESHOLD_BASE + 3 * 9)
|
||||
#define K8_MCE_THRESHOLD_BANK_4 (MCE_THRESHOLD_BASE + 4 * 9)
|
||||
#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
|
||||
#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
|
||||
|
||||
#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1)
|
||||
|
||||
#ifdef __KERNEL__
|
||||
|
||||
extern void mce_register_decode_chain(struct notifier_block *nb);
|
||||
extern void mce_unregister_decode_chain(struct notifier_block *nb);
|
||||
|
||||
|
@ -171,6 +161,7 @@ DECLARE_PER_CPU(struct device *, mce_device);
|
|||
#ifdef CONFIG_X86_MCE_INTEL
|
||||
extern int mce_cmci_disabled;
|
||||
extern int mce_ignore_ce;
|
||||
extern int mce_bios_cmci_threshold;
|
||||
void mce_intel_feature_init(struct cpuinfo_x86 *c);
|
||||
void cmci_clear(void);
|
||||
void cmci_reenable(void);
|
||||
|
|
|
@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
|
|||
}
|
||||
|
||||
static cpumask_var_t mce_inject_cpumask;
|
||||
/* Taken by the injection write path around raise_mce() so that injections are serialized. */
static DEFINE_MUTEX(mce_inject_mutex);
|
||||
|
||||
static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)
|
||||
{
|
||||
|
@ -194,7 +195,11 @@ static void raise_mce(struct mce *m)
|
|||
put_online_cpus();
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
preempt_disable();
|
||||
raise_local();
|
||||
preempt_enable();
|
||||
}
|
||||
}
|
||||
|
||||
/* Error injection interface */
|
||||
|
@ -225,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
|
|||
* so do it a jiffie or two later everywhere.
|
||||
*/
|
||||
schedule_timeout(2);
|
||||
|
||||
mutex_lock(&mce_inject_mutex);
|
||||
raise_mce(&m);
|
||||
mutex_unlock(&mce_inject_mutex);
|
||||
return usize;
|
||||
}
|
||||
|
||||
|
|
|
@ -28,6 +28,18 @@ extern int mce_ser;
|
|||
|
||||
extern struct mce_bank *mce_banks;
|
||||
|
||||
/*
 * Intel CMCI hooks used by the generic MCE code. Without
 * CONFIG_X86_MCE_INTEL they collapse to the default timer adjustment
 * and to empty inline stubs.
 */
#ifdef CONFIG_X86_MCE_INTEL
|
||||
/* Timer-interval hook; takes and returns an interval in jiffies. */
unsigned long mce_intel_adjust_timer(unsigned long interval);
|
||||
/* Called from the MCE poll timer after the regular bank poll. */
void mce_intel_cmci_poll(void);
|
||||
/* Called from the CPU hotplug callback once @cpu is dead. */
void mce_intel_hcpu_update(unsigned long cpu);
|
||||
#else
|
||||
# define mce_intel_adjust_timer mce_adjust_timer_default
|
||||
static inline void mce_intel_cmci_poll(void) { }
|
||||
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
|
||||
#endif
|
||||
|
||||
/* Make the local CPU's MCE poll timer fire in @interval jiffies from now. */
void mce_timer_kick(unsigned long interval);
|
||||
|
||||
#ifdef CONFIG_ACPI_APEI
|
||||
int apei_write_mce(struct mce *m);
|
||||
ssize_t apei_read_mce(struct mce *m, u64 *record_id);
|
||||
|
|
|
@ -83,6 +83,7 @@ static int mce_dont_log_ce __read_mostly;
|
|||
int mce_cmci_disabled __read_mostly;
|
||||
int mce_ignore_ce __read_mostly;
|
||||
int mce_ser __read_mostly;
|
||||
int mce_bios_cmci_threshold __read_mostly;
|
||||
|
||||
struct mce_bank *mce_banks __read_mostly;
|
||||
|
||||
|
@ -1266,6 +1267,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */
|
|||
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
|
||||
static DEFINE_PER_CPU(struct timer_list, mce_timer);
|
||||
|
||||
/* Default interval hook: hands the polling interval back unchanged. */
static unsigned long mce_adjust_timer_default(unsigned long interval)
|
||||
{
|
||||
return interval;
|
||||
}
|
||||
|
||||
/*
 * Hook through which the poll timer code passes a candidate interval
 * (in jiffies) before using it. Vendor init code may replace the default;
 * the Intel path installs mce_intel_adjust_timer.
 */
static unsigned long (*mce_adjust_timer)(unsigned long interval) =
|
||||
mce_adjust_timer_default;
|
||||
|
||||
static void mce_timer_fn(unsigned long data)
|
||||
{
|
||||
struct timer_list *t = &__get_cpu_var(mce_timer);
|
||||
|
@ -1276,6 +1285,7 @@ static void mce_timer_fn(unsigned long data)
|
|||
if (mce_available(__this_cpu_ptr(&cpu_info))) {
|
||||
machine_check_poll(MCP_TIMESTAMP,
|
||||
&__get_cpu_var(mce_poll_banks));
|
||||
mce_intel_cmci_poll();
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1283,15 +1293,39 @@ static void mce_timer_fn(unsigned long data)
|
|||
* polling interval, otherwise increase the polling interval.
|
||||
*/
|
||||
iv = __this_cpu_read(mce_next_interval);
|
||||
if (mce_notify_irq())
|
||||
if (mce_notify_irq()) {
|
||||
iv = max(iv / 2, (unsigned long) HZ/100);
|
||||
else
|
||||
} else {
|
||||
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
|
||||
iv = mce_adjust_timer(iv);
|
||||
}
|
||||
__this_cpu_write(mce_next_interval, iv);
|
||||
|
||||
/* Might have become 0 after CMCI storm subsided */
|
||||
if (iv) {
|
||||
t->expires = jiffies + iv;
|
||||
add_timer_on(t, smp_processor_id());
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Ensure that the timer is firing in @interval from now.
|
||||
*/
|
||||
void mce_timer_kick(unsigned long interval)
|
||||
{
|
||||
struct timer_list *t = &__get_cpu_var(mce_timer);
|
||||
unsigned long when = jiffies + interval;
|
||||
unsigned long iv = __this_cpu_read(mce_next_interval);
|
||||
|
||||
if (timer_pending(t)) {
|
||||
if (time_before(when, t->expires))
|
||||
mod_timer_pinned(t, when);
|
||||
} else {
|
||||
t->expires = round_jiffies(when);
|
||||
add_timer_on(t, smp_processor_id());
|
||||
}
|
||||
if (interval < iv)
|
||||
__this_cpu_write(mce_next_interval, interval);
|
||||
}
|
||||
|
||||
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
|
||||
static void mce_timer_delete_all(void)
|
||||
|
@ -1585,6 +1619,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
|
|||
switch (c->x86_vendor) {
|
||||
case X86_VENDOR_INTEL:
|
||||
mce_intel_feature_init(c);
|
||||
mce_adjust_timer = mce_intel_adjust_timer;
|
||||
break;
|
||||
case X86_VENDOR_AMD:
|
||||
mce_amd_feature_init(c);
|
||||
|
@ -1594,21 +1629,26 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
|
|||
}
|
||||
}
|
||||
|
||||
static void mce_start_timer(unsigned int cpu, struct timer_list *t)
|
||||
{
|
||||
unsigned long iv = mce_adjust_timer(check_interval * HZ);
|
||||
|
||||
__this_cpu_write(mce_next_interval, iv);
|
||||
|
||||
if (mce_ignore_ce || !iv)
|
||||
return;
|
||||
|
||||
t->expires = round_jiffies(jiffies + iv);
|
||||
add_timer_on(t, smp_processor_id());
|
||||
}
|
||||
|
||||
static void __mcheck_cpu_init_timer(void)
|
||||
{
|
||||
struct timer_list *t = &__get_cpu_var(mce_timer);
|
||||
unsigned long iv = check_interval * HZ;
|
||||
unsigned int cpu = smp_processor_id();
|
||||
|
||||
setup_timer(t, mce_timer_fn, smp_processor_id());
|
||||
|
||||
if (mce_ignore_ce)
|
||||
return;
|
||||
|
||||
__this_cpu_write(mce_next_interval, iv);
|
||||
if (!iv)
|
||||
return;
|
||||
t->expires = round_jiffies(jiffies + iv);
|
||||
add_timer_on(t, smp_processor_id());
|
||||
setup_timer(t, mce_timer_fn, cpu);
|
||||
mce_start_timer(cpu, t);
|
||||
}
|
||||
|
||||
/* Handle unconfigured int18 (should never happen) */
|
||||
|
@ -1907,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = {
|
|||
* check, or 0 to not wait
|
||||
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
|
||||
* mce=nobootlog Don't log MCEs from before booting.
|
||||
* mce=bios_cmci_threshold Don't program the CMCI threshold
|
||||
*/
|
||||
static int __init mcheck_enable(char *str)
|
||||
{
|
||||
|
@ -1926,6 +1967,8 @@ static int __init mcheck_enable(char *str)
|
|||
mce_ignore_ce = 1;
|
||||
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
|
||||
mce_bootlog = (str[0] == 'b');
|
||||
else if (!strcmp(str, "bios_cmci_threshold"))
|
||||
mce_bios_cmci_threshold = 1;
|
||||
else if (isdigit(str[0])) {
|
||||
get_option(&str, &tolerant);
|
||||
if (*str == ',') {
|
||||
|
@ -2166,6 +2209,11 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {
|
|||
&mce_cmci_disabled
|
||||
};
|
||||
|
||||
/*
 * Read-only (mode 0444, no store callback) device attribute that reports
 * the value of mce_bios_cmci_threshold through device_show_int.
 */
static struct dev_ext_attribute dev_attr_bios_cmci_threshold = {
|
||||
__ATTR(bios_cmci_threshold, 0444, device_show_int, NULL),
|
||||
&mce_bios_cmci_threshold
|
||||
};
|
||||
|
||||
static struct device_attribute *mce_device_attrs[] = {
|
||||
&dev_attr_tolerant.attr,
|
||||
&dev_attr_check_interval.attr,
|
||||
|
@ -2174,6 +2222,7 @@ static struct device_attribute *mce_device_attrs[] = {
|
|||
&dev_attr_dont_log_ce.attr,
|
||||
&dev_attr_ignore_ce.attr,
|
||||
&dev_attr_cmci_disabled.attr,
|
||||
&dev_attr_bios_cmci_threshold.attr,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
@ -2294,38 +2343,33 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
|
|||
unsigned int cpu = (unsigned long)hcpu;
|
||||
struct timer_list *t = &per_cpu(mce_timer, cpu);
|
||||
|
||||
switch (action) {
|
||||
switch (action & ~CPU_TASKS_FROZEN) {
|
||||
case CPU_ONLINE:
|
||||
case CPU_ONLINE_FROZEN:
|
||||
mce_device_create(cpu);
|
||||
if (threshold_cpu_callback)
|
||||
threshold_cpu_callback(action, cpu);
|
||||
break;
|
||||
case CPU_DEAD:
|
||||
case CPU_DEAD_FROZEN:
|
||||
if (threshold_cpu_callback)
|
||||
threshold_cpu_callback(action, cpu);
|
||||
mce_device_remove(cpu);
|
||||
mce_intel_hcpu_update(cpu);
|
||||
break;
|
||||
case CPU_DOWN_PREPARE:
|
||||
case CPU_DOWN_PREPARE_FROZEN:
|
||||
del_timer_sync(t);
|
||||
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
|
||||
del_timer_sync(t);
|
||||
break;
|
||||
case CPU_DOWN_FAILED:
|
||||
case CPU_DOWN_FAILED_FROZEN:
|
||||
if (!mce_ignore_ce && check_interval) {
|
||||
t->expires = round_jiffies(jiffies +
|
||||
per_cpu(mce_next_interval, cpu));
|
||||
add_timer_on(t, cpu);
|
||||
}
|
||||
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
|
||||
mce_start_timer(cpu, t);
|
||||
break;
|
||||
case CPU_POST_DEAD:
|
||||
}
|
||||
|
||||
if (action == CPU_POST_DEAD) {
|
||||
/* intentionally ignoring frozen here */
|
||||
cmci_rediscover(cpu);
|
||||
break;
|
||||
}
|
||||
|
||||
return NOTIFY_OK;
|
||||
}
|
||||
|
||||
|
|
|
@ -15,6 +15,8 @@
|
|||
#include <asm/msr.h>
|
||||
#include <asm/mce.h>
|
||||
|
||||
#include "mce-internal.h"
|
||||
|
||||
/*
|
||||
* Support for Intel Correct Machine Check Interrupts. This allows
|
||||
* the CPU to raise an interrupt when a corrected machine check happened.
|
||||
|
@ -31,6 +33,21 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
|
|||
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
|
||||
|
||||
/* Threshold value that cmci_discover() ORs into a bank's CTL2 MSR. */
#define CMCI_THRESHOLD 1
|
||||
/* Timer interval (jiffies) used while a CPU is handling a CMCI storm. */
#define CMCI_POLL_INTERVAL (30 * HZ)
|
||||
/* Length (jiffies) of the window in which CMCIs are counted. */
#define CMCI_STORM_INTERVAL (1 * HZ)
|
||||
/* More than this many CMCIs inside one window is treated as a storm. */
#define CMCI_STORM_THRESHOLD 15
|
||||
|
||||
/* jiffies value at which the current counting window started */
static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
|
||||
/* number of CMCIs seen in the current counting window */
static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
|
||||
/* one of the CMCI_STORM_* values below */
static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
|
||||
|
||||
/* Per-CPU storm handling states. */
enum {
|
||||
/* normal operation */
CMCI_STORM_NONE,
|
||||
/* storm detected on this CPU; set by cmci_storm_detect() */
CMCI_STORM_ACTIVE,
|
||||
/* this CPU has quieted down and waits for the global count to reach 0 */
CMCI_STORM_SUBSIDED,
|
||||
};
|
||||
|
||||
/*
 * Count of CPUs in CMCI_STORM_ACTIVE: incremented when a CPU detects a
 * storm, decremented when it leaves ACTIVE or is taken offline in that
 * state.
 */
static atomic_t cmci_storm_on_cpus;
|
||||
|
||||
static int cmci_supported(int *banks)
|
||||
{
|
||||
|
@ -53,6 +70,93 @@ static int cmci_supported(int *banks)
|
|||
return !!(cap & MCG_CMCI_P);
|
||||
}
|
||||
|
||||
void mce_intel_cmci_poll(void)
|
||||
{
|
||||
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
|
||||
return;
|
||||
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
|
||||
}
|
||||
|
||||
void mce_intel_hcpu_update(unsigned long cpu)
|
||||
{
|
||||
if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
|
||||
atomic_dec(&cmci_storm_on_cpus);
|
||||
|
||||
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
|
||||
}
|
||||
|
||||
unsigned long mce_intel_adjust_timer(unsigned long interval)
|
||||
{
|
||||
int r;
|
||||
|
||||
if (interval < CMCI_POLL_INTERVAL)
|
||||
return interval;
|
||||
|
||||
switch (__this_cpu_read(cmci_storm_state)) {
|
||||
case CMCI_STORM_ACTIVE:
|
||||
/*
|
||||
* We switch back to interrupt mode once the poll timer has
|
||||
* silenced itself. That means no events recorded and the
|
||||
* timer interval is back to our poll interval.
|
||||
*/
|
||||
__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
|
||||
r = atomic_sub_return(1, &cmci_storm_on_cpus);
|
||||
if (r == 0)
|
||||
pr_notice("CMCI storm subsided: switching to interrupt mode\n");
|
||||
/* FALLTHROUGH */
|
||||
|
||||
case CMCI_STORM_SUBSIDED:
|
||||
/*
|
||||
* We wait for all cpus to go back to SUBSIDED
|
||||
* state. When that happens we switch back to
|
||||
* interrupt mode.
|
||||
*/
|
||||
if (!atomic_read(&cmci_storm_on_cpus)) {
|
||||
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
|
||||
cmci_reenable();
|
||||
cmci_recheck();
|
||||
}
|
||||
return CMCI_POLL_INTERVAL;
|
||||
default:
|
||||
/*
|
||||
* We have shiny weather. Let the poll do whatever it
|
||||
* thinks.
|
||||
*/
|
||||
return interval;
|
||||
}
|
||||
}
|
||||
|
||||
static bool cmci_storm_detect(void)
|
||||
{
|
||||
unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
|
||||
unsigned long ts = __this_cpu_read(cmci_time_stamp);
|
||||
unsigned long now = jiffies;
|
||||
int r;
|
||||
|
||||
if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
|
||||
return true;
|
||||
|
||||
if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
|
||||
cnt++;
|
||||
} else {
|
||||
cnt = 1;
|
||||
__this_cpu_write(cmci_time_stamp, now);
|
||||
}
|
||||
__this_cpu_write(cmci_storm_cnt, cnt);
|
||||
|
||||
if (cnt <= CMCI_STORM_THRESHOLD)
|
||||
return false;
|
||||
|
||||
cmci_clear();
|
||||
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
|
||||
r = atomic_add_return(1, &cmci_storm_on_cpus);
|
||||
mce_timer_kick(CMCI_POLL_INTERVAL);
|
||||
|
||||
if (r == 1)
|
||||
pr_notice("CMCI storm detected: switching to poll mode\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* The interrupt handler. This is called on every event.
|
||||
* Just call the poller directly to log any events.
|
||||
|
@ -61,33 +165,28 @@ static int cmci_supported(int *banks)
|
|||
*/
|
||||
static void intel_threshold_interrupt(void)
|
||||
{
|
||||
if (cmci_storm_detect())
|
||||
return;
|
||||
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
|
||||
mce_notify_irq();
|
||||
}
|
||||
|
||||
static void print_update(char *type, int *hdr, int num)
|
||||
{
|
||||
if (*hdr == 0)
|
||||
printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
|
||||
*hdr = 1;
|
||||
printk(KERN_CONT " %s:%d", type, num);
|
||||
}
|
||||
|
||||
/*
|
||||
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
|
||||
* on this CPU. Use the algorithm recommended in the SDM to discover shared
|
||||
* banks.
|
||||
*/
|
||||
static void cmci_discover(int banks, int boot)
|
||||
static void cmci_discover(int banks)
|
||||
{
|
||||
unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
|
||||
unsigned long flags;
|
||||
int hdr = 0;
|
||||
int i;
|
||||
int bios_wrong_thresh = 0;
|
||||
|
||||
raw_spin_lock_irqsave(&cmci_discover_lock, flags);
|
||||
for (i = 0; i < banks; i++) {
|
||||
u64 val;
|
||||
int bios_zero_thresh = 0;
|
||||
|
||||
if (test_bit(i, owned))
|
||||
continue;
|
||||
|
@ -96,29 +195,52 @@ static void cmci_discover(int banks, int boot)
|
|||
|
||||
/* Already owned by someone else? */
|
||||
if (val & MCI_CTL2_CMCI_EN) {
|
||||
if (test_and_clear_bit(i, owned) && !boot)
|
||||
print_update("SHD", &hdr, i);
|
||||
clear_bit(i, owned);
|
||||
__clear_bit(i, __get_cpu_var(mce_poll_banks));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!mce_bios_cmci_threshold) {
|
||||
val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
|
||||
val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
|
||||
val |= CMCI_THRESHOLD;
|
||||
} else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
|
||||
/*
|
||||
* If bios_cmci_threshold boot option was specified
|
||||
* but the threshold is zero, we'll try to initialize
|
||||
* it to 1.
|
||||
*/
|
||||
bios_zero_thresh = 1;
|
||||
val |= CMCI_THRESHOLD;
|
||||
}
|
||||
|
||||
val |= MCI_CTL2_CMCI_EN;
|
||||
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
|
||||
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
|
||||
|
||||
/* Did the enable bit stick? -- the bank supports CMCI */
|
||||
if (val & MCI_CTL2_CMCI_EN) {
|
||||
if (!test_and_set_bit(i, owned) && !boot)
|
||||
print_update("CMCI", &hdr, i);
|
||||
set_bit(i, owned);
|
||||
__clear_bit(i, __get_cpu_var(mce_poll_banks));
|
||||
/*
|
||||
* We are able to set thresholds for some banks that
|
||||
* had a threshold of 0. This means the BIOS has not
|
||||
* set the thresholds properly or does not work with
|
||||
* this boot option. Note down now and report later.
|
||||
*/
|
||||
if (mce_bios_cmci_threshold && bios_zero_thresh &&
|
||||
(val & MCI_CTL2_CMCI_THRESHOLD_MASK))
|
||||
bios_wrong_thresh = 1;
|
||||
} else {
|
||||
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
|
||||
}
|
||||
}
|
||||
raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
|
||||
if (hdr)
|
||||
printk(KERN_CONT "\n");
|
||||
if (mce_bios_cmci_threshold && bios_wrong_thresh) {
|
||||
pr_info_once(
|
||||
"bios_cmci_threshold: Some banks do not have valid thresholds set\n");
|
||||
pr_info_once(
|
||||
"bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -156,7 +278,7 @@ void cmci_clear(void)
|
|||
continue;
|
||||
/* Disable CMCI */
|
||||
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
|
||||
val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
|
||||
val &= ~MCI_CTL2_CMCI_EN;
|
||||
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
|
||||
__clear_bit(i, __get_cpu_var(mce_banks_owned));
|
||||
}
|
||||
|
@ -186,7 +308,7 @@ void cmci_rediscover(int dying)
|
|||
continue;
|
||||
/* Recheck banks in case CPUs don't all have the same */
|
||||
if (cmci_supported(&banks))
|
||||
cmci_discover(banks, 0);
|
||||
cmci_discover(banks);
|
||||
}
|
||||
|
||||
set_cpus_allowed_ptr(current, old);
|
||||
|
@ -200,7 +322,7 @@ void cmci_reenable(void)
|
|||
{
|
||||
int banks;
|
||||
if (cmci_supported(&banks))
|
||||
cmci_discover(banks, 0);
|
||||
cmci_discover(banks);
|
||||
}
|
||||
|
||||
static void intel_init_cmci(void)
|
||||
|
@ -211,7 +333,7 @@ static void intel_init_cmci(void)
|
|||
return;
|
||||
|
||||
mce_threshold_vector = intel_threshold_interrupt;
|
||||
cmci_discover(banks, 1);
|
||||
cmci_discover(banks);
|
||||
/*
|
||||
* For CPU #0 this runs with still disabled APIC, but that's
|
||||
* ok because only the vector is set up. We still do another
|
||||
|
|
Loading…
Reference in a new issue