drm/i915: clear up wedged transitions
We have two important transitions of the wedged state in the current code: - 0 -> 1: This means a hang has been detected, and signals to everyone that they please get of any locks, so that the reset work item can do its job. - 1 -> 0: The reset handler has completed. Now the last transition mixes up two states: "Reset completed and successful" and "Reset failed". To distinguish these two we do some tricks with the reset completion, but I simply could not convince myself that this doesn't race under odd circumstances. Hence split this up, and add a new terminal state indicating that the hw is gone for good. Also add explicit #defines for both states, update comments. v2: Split out the reset handling bugfix for the throttle ioctl. v3: s/tmp/wedged/ sugested by Chris Wilson. Also fixup up a rebase error which prevented this patch from actually compiling. v4: To unify the wedged state with the reset counter, keep the reset-in-progress state just as a flag. The terminally-wedged state is now denoted with a big number. v5: Add a comment to the reset_counter special values explaining that WEDGED & RESET_IN_PROGRESS needs to be true for the code to be correct. v6: Fixup logic errors introduced with the wedged+reset_counter unification. Since WEDGED implies reset-in-progress (in a way we're terminally stuck in the dead-but-reset-not-completed state), we need ensure that we check for this everywhere. The specific bug was in wait_for_error, which would simply have timed out. v7: Extract an inline i915_reset_in_progress helper to make the code more readable. Also annote the reset-in-progress case with an unlikely, to help the compiler optimize the fastpath. Do the same for the terminally wedged case with i915_terminally_wedged. Reviewed-by: Damien Lespiau <damien.lespiau@intel.com> Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
This commit is contained in:
parent
308887aad1
commit
1f83fee08d
5 changed files with 74 additions and 44 deletions
|
@ -1672,7 +1672,7 @@ i915_wedged_read(struct file *filp,
|
|||
|
||||
len = snprintf(buf, sizeof(buf),
|
||||
"wedged : %d\n",
|
||||
atomic_read(&dev_priv->gpu_error.wedged));
|
||||
atomic_read(&dev_priv->gpu_error.reset_counter));
|
||||
|
||||
if (len > sizeof(buf))
|
||||
len = sizeof(buf);
|
||||
|
|
|
@ -771,11 +771,37 @@ struct i915_gpu_error {
|
|||
/* Protected by the above dev->gpu_error.lock. */
|
||||
struct drm_i915_error_state *first_error;
|
||||
struct work_struct work;
|
||||
struct completion completion;
|
||||
|
||||
unsigned long last_reset;
|
||||
|
||||
atomic_t wedged;
|
||||
/**
|
||||
* State variable controlling the reset flow
|
||||
*
|
||||
* Upper bits are for the reset counter.
|
||||
*
|
||||
* Lowest bit controls the reset state machine: Set means a reset is in
|
||||
* progress. This state will (presuming we don't have any bugs) decay
|
||||
* into either unset (successful reset) or the special WEDGED value (hw
|
||||
* terminally sour). All waiters on the reset_queue will be woken when
|
||||
* that happens.
|
||||
*/
|
||||
atomic_t reset_counter;
|
||||
|
||||
/**
|
||||
* Special values/flags for reset_counter
|
||||
*
|
||||
* Note that the code relies on
|
||||
* I915_WEDGED & I915_RESET_IN_PROGRESS_FLAG
|
||||
* being true.
|
||||
*/
|
||||
#define I915_RESET_IN_PROGRESS_FLAG 1
|
||||
#define I915_WEDGED 0xffffffff
|
||||
|
||||
/**
|
||||
* Waitqueue to signal when the reset has completed. Used by clients
|
||||
* that wait for dev_priv->mm.wedged to settle.
|
||||
*/
|
||||
wait_queue_head_t reset_queue;
|
||||
|
||||
/* For gpu hang simulation. */
|
||||
unsigned int stop_rings;
|
||||
|
@ -1543,6 +1569,16 @@ void i915_gem_retire_requests(struct drm_device *dev);
|
|||
void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
|
||||
int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
|
||||
bool interruptible);
|
||||
static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
|
||||
{
|
||||
return unlikely(atomic_read(&error->reset_counter)
|
||||
& I915_RESET_IN_PROGRESS_FLAG);
|
||||
}
|
||||
|
||||
static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
|
||||
{
|
||||
return atomic_read(&error->reset_counter) == I915_WEDGED;
|
||||
}
|
||||
|
||||
void i915_gem_reset(struct drm_device *dev);
|
||||
void i915_gem_clflush_object(struct drm_i915_gem_object *obj);
|
||||
|
|
|
@ -89,36 +89,32 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
|
|||
static int
|
||||
i915_gem_wait_for_error(struct i915_gpu_error *error)
|
||||
{
|
||||
struct completion *x = &error->completion;
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
if (!atomic_read(&error->wedged))
|
||||
#define EXIT_COND (!i915_reset_in_progress(error))
|
||||
if (EXIT_COND)
|
||||
return 0;
|
||||
|
||||
/* GPU is already declared terminally dead, give up. */
|
||||
if (i915_terminally_wedged(error))
|
||||
return -EIO;
|
||||
|
||||
/*
|
||||
* Only wait 10 seconds for the gpu reset to complete to avoid hanging
|
||||
* userspace. If it takes that long something really bad is going on and
|
||||
* we should simply try to bail out and fail as gracefully as possible.
|
||||
*/
|
||||
ret = wait_for_completion_interruptible_timeout(x, 10*HZ);
|
||||
ret = wait_event_interruptible_timeout(error->reset_queue,
|
||||
EXIT_COND,
|
||||
10*HZ);
|
||||
if (ret == 0) {
|
||||
DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
|
||||
return -EIO;
|
||||
} else if (ret < 0) {
|
||||
return ret;
|
||||
}
|
||||
#undef EXIT_COND
|
||||
|
||||
if (atomic_read(&error->wedged)) {
|
||||
/* GPU is hung, bump the completion count to account for
|
||||
* the token we just consumed so that we never hit zero and
|
||||
* end up waiting upon a subsequent completion event that
|
||||
* will never happen.
|
||||
*/
|
||||
spin_lock_irqsave(&x->wait.lock, flags);
|
||||
x->done++;
|
||||
spin_unlock_irqrestore(&x->wait.lock, flags);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -942,23 +938,14 @@ int
|
|||
i915_gem_check_wedge(struct i915_gpu_error *error,
|
||||
bool interruptible)
|
||||
{
|
||||
if (atomic_read(&error->wedged)) {
|
||||
struct completion *x = &error->completion;
|
||||
bool recovery_complete;
|
||||
unsigned long flags;
|
||||
|
||||
/* Give the error handler a chance to run. */
|
||||
spin_lock_irqsave(&x->wait.lock, flags);
|
||||
recovery_complete = x->done > 0;
|
||||
spin_unlock_irqrestore(&x->wait.lock, flags);
|
||||
|
||||
if (i915_reset_in_progress(error)) {
|
||||
/* Non-interruptible callers can't handle -EAGAIN, hence return
|
||||
* -EIO unconditionally for these. */
|
||||
if (!interruptible)
|
||||
return -EIO;
|
||||
|
||||
/* Recovery complete, but still wedged means reset failure. */
|
||||
if (recovery_complete)
|
||||
/* Recovery complete, but the reset failed ... */
|
||||
if (i915_terminally_wedged(error))
|
||||
return -EIO;
|
||||
|
||||
return -EAGAIN;
|
||||
|
@ -1025,7 +1012,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
|
|||
|
||||
#define EXIT_COND \
|
||||
(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
|
||||
atomic_read(&dev_priv->gpu_error.wedged))
|
||||
i915_reset_in_progress(&dev_priv->gpu_error))
|
||||
do {
|
||||
if (interruptible)
|
||||
end = wait_event_interruptible_timeout(ring->irq_queue,
|
||||
|
@ -1379,7 +1366,7 @@ out:
|
|||
/* If this -EIO is due to a gpu hang, give the reset code a
|
||||
* chance to clean up the mess. Otherwise return the proper
|
||||
* SIGBUS. */
|
||||
if (!atomic_read(&dev_priv->gpu_error.wedged))
|
||||
if (i915_terminally_wedged(&dev_priv->gpu_error))
|
||||
return VM_FAULT_SIGBUS;
|
||||
case -EAGAIN:
|
||||
/* Give the error handler a chance to run and move the
|
||||
|
@ -3983,9 +3970,9 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
|
|||
if (drm_core_check_feature(dev, DRIVER_MODESET))
|
||||
return 0;
|
||||
|
||||
if (atomic_read(&dev_priv->gpu_error.wedged)) {
|
||||
if (i915_reset_in_progress(&dev_priv->gpu_error)) {
|
||||
DRM_ERROR("Reenabling wedged hardware, good luck\n");
|
||||
atomic_set(&dev_priv->gpu_error.wedged, 0);
|
||||
atomic_set(&dev_priv->gpu_error.reset_counter, 0);
|
||||
}
|
||||
|
||||
mutex_lock(&dev->struct_mutex);
|
||||
|
@ -4069,7 +4056,7 @@ i915_gem_load(struct drm_device *dev)
|
|||
INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list);
|
||||
INIT_DELAYED_WORK(&dev_priv->mm.retire_work,
|
||||
i915_gem_retire_work_handler);
|
||||
init_completion(&dev_priv->gpu_error.completion);
|
||||
init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
|
||||
|
||||
/* On GEN3 we really need to make sure the ARB C3 LP bit is set */
|
||||
if (IS_GEN3(dev)) {
|
||||
|
|
|
@ -862,8 +862,10 @@ done:
|
|||
*/
|
||||
static void i915_error_work_func(struct work_struct *work)
|
||||
{
|
||||
drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
|
||||
gpu_error.work);
|
||||
struct i915_gpu_error *error = container_of(work, struct i915_gpu_error,
|
||||
work);
|
||||
drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t,
|
||||
gpu_error);
|
||||
struct drm_device *dev = dev_priv->dev;
|
||||
char *error_event[] = { "ERROR=1", NULL };
|
||||
char *reset_event[] = { "RESET=1", NULL };
|
||||
|
@ -871,14 +873,18 @@ static void i915_error_work_func(struct work_struct *work)
|
|||
|
||||
kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event);
|
||||
|
||||
if (atomic_read(&dev_priv->gpu_error.wedged)) {
|
||||
if (i915_reset_in_progress(error)) {
|
||||
DRM_DEBUG_DRIVER("resetting chip\n");
|
||||
kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_event);
|
||||
|
||||
if (!i915_reset(dev)) {
|
||||
atomic_set(&dev_priv->gpu_error.wedged, 0);
|
||||
atomic_set(&error->reset_counter, 0);
|
||||
kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_done_event);
|
||||
} else {
|
||||
atomic_set(&error->reset_counter, I915_WEDGED);
|
||||
}
|
||||
complete_all(&dev_priv->gpu_error.completion);
|
||||
|
||||
wake_up_all(&dev_priv->gpu_error.reset_queue);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1482,11 +1488,12 @@ void i915_handle_error(struct drm_device *dev, bool wedged)
|
|||
i915_report_and_clear_eir(dev);
|
||||
|
||||
if (wedged) {
|
||||
INIT_COMPLETION(dev_priv->gpu_error.completion);
|
||||
atomic_set(&dev_priv->gpu_error.wedged, 1);
|
||||
atomic_set(&dev_priv->gpu_error.reset_counter,
|
||||
I915_RESET_IN_PROGRESS_FLAG);
|
||||
|
||||
/*
|
||||
* Wakeup waiting processes so they don't hang
|
||||
* Wakeup waiting processes so that the reset work item
|
||||
* doesn't deadlock trying to grab various locks.
|
||||
*/
|
||||
for_each_ring(ring, dev_priv, i)
|
||||
wake_up_all(&ring->irq_queue);
|
||||
|
|
|
@ -2223,7 +2223,7 @@ intel_finish_fb(struct drm_framebuffer *old_fb)
|
|||
WARN_ON(waitqueue_active(&dev_priv->pending_flip_queue));
|
||||
|
||||
wait_event(dev_priv->pending_flip_queue,
|
||||
atomic_read(&dev_priv->gpu_error.wedged) ||
|
||||
i915_reset_in_progress(&dev_priv->gpu_error) ||
|
||||
atomic_read(&obj->pending_flip) == 0);
|
||||
|
||||
/* Big Hammer, we also need to ensure that any pending
|
||||
|
@ -2871,7 +2871,7 @@ static bool intel_crtc_has_pending_flip(struct drm_crtc *crtc)
|
|||
unsigned long flags;
|
||||
bool pending;
|
||||
|
||||
if (atomic_read(&dev_priv->gpu_error.wedged))
|
||||
if (i915_reset_in_progress(&dev_priv->gpu_error))
|
||||
return false;
|
||||
|
||||
spin_lock_irqsave(&dev->event_lock, flags);
|
||||
|
|
Loading…
Reference in a new issue